duckstation

duckstation, archived from the revision just before upstream relicensed the project as proprietary software; this version is the libre one
git clone https://git.neptards.moe/u3shit/duckstation.git

macro-assembler-sve-aarch64.cc (83794B)


      1 // Copyright 2019, VIXL authors
      2 // All rights reserved.
      3 //
      4 // Redistribution and use in source and binary forms, with or without
      5 // modification, are permitted provided that the following conditions are met:
      6 //
      7 //   * Redistributions of source code must retain the above copyright notice,
      8 //     this list of conditions and the following disclaimer.
      9 //   * Redistributions in binary form must reproduce the above copyright notice,
     10 //     this list of conditions and the following disclaimer in the documentation
     11 //     and/or other materials provided with the distribution.
     12 //   * Neither the name of ARM Limited nor the names of its contributors may be
     13 //     used to endorse or promote products derived from this software without
     14 //     specific prior written permission.
     15 //
     16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 // WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
     20 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     21 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
     22 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
     23 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
     24 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     25 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 
     27 #include "macro-assembler-aarch64.h"
     28 
     29 namespace vixl {
     30 namespace aarch64 {
     31 
     32 void MacroAssembler::AddSubHelper(AddSubHelperOption option,
     33                                   const ZRegister& zd,
     34                                   const ZRegister& zn,
     35                                   IntegerOperand imm) {
     36   VIXL_ASSERT(imm.FitsInLane(zd));
     37 
     38   // Simple, encodable cases.
     39   if (TrySingleAddSub(option, zd, zn, imm)) return;
     40 
     41   VIXL_ASSERT((option == kAddImmediate) || (option == kSubImmediate));
     42   bool add_imm = (option == kAddImmediate);
     43 
     44   // Try to translate Add(..., -imm) to Sub(..., imm) if we can encode it in one
     45   // instruction. Also interpret the immediate as signed, so we can convert
     46   // Add(zd.VnH(), zn.VnH(), 0xffff...) to Sub(..., 1), etc.
     47   IntegerOperand signed_imm(imm.AsIntN(zd.GetLaneSizeInBits()));
     48   if (signed_imm.IsNegative()) {
     49     AddSubHelperOption n_option = add_imm ? kSubImmediate : kAddImmediate;
     50     IntegerOperand n_imm(signed_imm.GetMagnitude());
     51     // IntegerOperand can represent -INT64_MIN, so this is always safe.
     52     VIXL_ASSERT(n_imm.IsPositiveOrZero());
     53     if (TrySingleAddSub(n_option, zd, zn, n_imm)) return;
     54   }
     55 
     56   // Otherwise, fall back to dup + ADD_z_z/SUB_z_z.
     57   UseScratchRegisterScope temps(this);
     58   ZRegister scratch = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
     59   Dup(scratch, imm);
     60 
     61   SingleEmissionCheckScope guard(this);
     62   if (add_imm) {
     63     add(zd, zn, scratch);
     64   } else {
     65     sub(zd, zn, scratch);
     66   }
     67 }
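// Illustrative annotation (not part of the original file), with assumed
// registers: AddSubHelper(kAddImmediate, z0.VnH(), z0.VnH(), 1) emits a single
// `add z0.h, z0.h, #1`; an immediate of 0xffff is reinterpreted as signed -1
// and becomes `sub z0.h, z0.h, #1`; anything not encodable as a shifted 8-bit
// value is broadcast with Dup into a scratch Z register and added or
// subtracted as a vector.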
     68 
     69 bool MacroAssembler::TrySingleAddSub(AddSubHelperOption option,
     70                                      const ZRegister& zd,
     71                                      const ZRegister& zn,
     72                                      IntegerOperand imm) {
     73   VIXL_ASSERT(imm.FitsInLane(zd));
     74 
     75   int imm8;
     76   int shift = -1;
     77   if (imm.TryEncodeAsShiftedUintNForLane<8, 0>(zd, &imm8, &shift) ||
     78       imm.TryEncodeAsShiftedUintNForLane<8, 8>(zd, &imm8, &shift)) {
     79     MovprfxHelperScope guard(this, zd, zn);
     80     switch (option) {
     81       case kAddImmediate:
     82         add(zd, zd, imm8, shift);
     83         return true;
     84       case kSubImmediate:
     85         sub(zd, zd, imm8, shift);
     86         return true;
     87     }
     88   }
     89   return false;
     90 }
     91 
     92 void MacroAssembler::IntWideImmHelper(IntArithImmFn imm_fn,
     93                                       SVEArithPredicatedFn reg_macro,
     94                                       const ZRegister& zd,
     95                                       const ZRegister& zn,
     96                                       IntegerOperand imm,
     97                                       bool is_signed) {
     98   if (is_signed) {
     99     // E.g. MUL_z_zi, SMIN_z_zi, SMAX_z_zi
    100     if (imm.IsInt8()) {
    101       MovprfxHelperScope guard(this, zd, zn);
    102       (this->*imm_fn)(zd, zd, imm.AsInt8());
    103       return;
    104     }
    105   } else {
    106     // E.g. UMIN_z_zi, UMAX_z_zi
    107     if (imm.IsUint8()) {
    108       MovprfxHelperScope guard(this, zd, zn);
    109       (this->*imm_fn)(zd, zd, imm.AsUint8());
    110       return;
    111     }
    112   }
    113 
    114   UseScratchRegisterScope temps(this);
    115   PRegister pg = temps.AcquireGoverningP();
    116   Ptrue(pg.WithSameLaneSizeAs(zd));
    117 
    118   // Try to re-use zd if we can, so we can avoid a movprfx.
    119   ZRegister scratch =
    120       zd.Aliases(zn) ? temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits())
    121                      : zd;
    122   Dup(scratch, imm);
    123 
    124   // The vector-form macro for commutative operations will swap the arguments to
    125   // avoid movprfx, if necessary.
    126   (this->*reg_macro)(zd, pg.Merging(), zn, scratch);
    127 }
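// Illustrative annotation (not part of the original file), with assumed
// registers: Smin(z0.VnS(), z0.VnS(), 7) fits the signed 8-bit immediate form
// and is emitted directly; an immediate such as 1000 is instead materialised
// with Ptrue + Dup into a scratch Z register and handled by the predicated
// register-form macro.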
    128 
    129 void MacroAssembler::Mul(const ZRegister& zd,
    130                          const ZRegister& zn,
    131                          IntegerOperand imm) {
    132   VIXL_ASSERT(allow_macro_instructions_);
    133   IntArithImmFn imm_fn = &Assembler::mul;
    134   SVEArithPredicatedFn reg_fn = &MacroAssembler::Mul;
    135   IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true);
    136 }
    137 
    138 void MacroAssembler::Smin(const ZRegister& zd,
    139                           const ZRegister& zn,
    140                           IntegerOperand imm) {
    141   VIXL_ASSERT(allow_macro_instructions_);
    142   VIXL_ASSERT(imm.FitsInSignedLane(zd));
    143   IntArithImmFn imm_fn = &Assembler::smin;
    144   SVEArithPredicatedFn reg_fn = &MacroAssembler::Smin;
    145   IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true);
    146 }
    147 
    148 void MacroAssembler::Smax(const ZRegister& zd,
    149                           const ZRegister& zn,
    150                           IntegerOperand imm) {
    151   VIXL_ASSERT(allow_macro_instructions_);
    152   VIXL_ASSERT(imm.FitsInSignedLane(zd));
    153   IntArithImmFn imm_fn = &Assembler::smax;
    154   SVEArithPredicatedFn reg_fn = &MacroAssembler::Smax;
    155   IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true);
    156 }
    157 
    158 void MacroAssembler::Umax(const ZRegister& zd,
    159                           const ZRegister& zn,
    160                           IntegerOperand imm) {
    161   VIXL_ASSERT(allow_macro_instructions_);
    162   VIXL_ASSERT(imm.FitsInUnsignedLane(zd));
    163   IntArithImmFn imm_fn = &Assembler::umax;
    164   SVEArithPredicatedFn reg_fn = &MacroAssembler::Umax;
    165   IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, false);
    166 }
    167 
    168 void MacroAssembler::Umin(const ZRegister& zd,
    169                           const ZRegister& zn,
    170                           IntegerOperand imm) {
    171   VIXL_ASSERT(allow_macro_instructions_);
    172   VIXL_ASSERT(imm.FitsInUnsignedLane(zd));
    173   IntArithImmFn imm_fn = &Assembler::umin;
    174   SVEArithPredicatedFn reg_fn = &MacroAssembler::Umin;
    175   IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, false);
    176 }
    177 
    178 void MacroAssembler::Addpl(const Register& xd,
    179                            const Register& xn,
    180                            int64_t multiplier) {
    181   VIXL_ASSERT(allow_macro_instructions_);
    182 
    183   // This macro relies on `Rdvl` to handle some out-of-range cases. Check that
    184   // `VL * multiplier` cannot overflow, for any possible value of VL.
    185   VIXL_ASSERT(multiplier <= (INT64_MAX / kZRegMaxSizeInBytes));
    186   VIXL_ASSERT(multiplier >= (INT64_MIN / kZRegMaxSizeInBytes));
    187 
    188   if (xd.IsZero()) return;
    189   if (xn.IsZero() && xd.IsSP()) {
    190     // TODO: This operation doesn't make much sense, but we could support it
    191     // with a scratch register if necessary.
    192     VIXL_UNIMPLEMENTED();
    193   }
    194 
    195   // Handling xzr requires an extra move, so defer it until later so we can try
    196   // to use `rdvl` instead (via `Addvl`).
    197   if (IsInt6(multiplier) && !xn.IsZero()) {
    198     SingleEmissionCheckScope guard(this);
    199     addpl(xd, xn, static_cast<int>(multiplier));
    200     return;
    201   }
    202 
    203   // If `multiplier` is a multiple of 8, we can use `Addvl` instead.
    204   if ((multiplier % kZRegBitsPerPRegBit) == 0) {
    205     Addvl(xd, xn, multiplier / kZRegBitsPerPRegBit);
    206     return;
    207   }
    208 
    209   if (IsInt6(multiplier)) {
    210     VIXL_ASSERT(xn.IsZero());  // Other cases were handled with `addpl`.
    211     // There is no simple `rdpl` instruction, and `addpl` cannot accept xzr, so
    212     // materialise a zero.
    213     MacroEmissionCheckScope guard(this);
    214     movz(xd, 0);
    215     addpl(xd, xd, static_cast<int>(multiplier));
    216     return;
    217   }
    218 
    219   // TODO: Some probable cases result in rather long sequences. For example,
    220   // `Addpl(sp, sp, 33)` requires five instructions, even though it's only just
    221   // outside the encodable range. We should look for ways to cover such cases
    222   // without drastically increasing the complexity of this logic.
    223 
    224   // For other cases, calculate xn + (PL * multiplier) using discrete
    225   // instructions. This requires two scratch registers in the general case, so
    226   // try to re-use the destination as a scratch register.
    227   UseScratchRegisterScope temps(this);
    228   temps.Include(xd);
    229   temps.Exclude(xn);
    230 
    231   Register scratch = temps.AcquireX();
    232   // There is no `rdpl`, so we have to calculate PL from VL. We can't
    233   // scale the multiplier because (we already know) it isn't a multiple of 8.
    234   Rdvl(scratch, multiplier);
    235 
    236   MacroEmissionCheckScope guard(this);
    237   if (xn.IsZero()) {
    238     asr(xd, scratch, kZRegBitsPerPRegBitLog2);
    239   } else if (xd.IsSP() || xn.IsSP()) {
    240     // TODO: MacroAssembler::Add should be able to handle this.
    241     asr(scratch, scratch, kZRegBitsPerPRegBitLog2);
    242     add(xd, xn, scratch);
    243   } else {
    244     add(xd, xn, Operand(scratch, ASR, kZRegBitsPerPRegBitLog2));
    245   }
    246 }
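// Illustrative annotation (not part of the original file), with assumed
// registers: Addpl(x0, x1, 3) fits the signed 6-bit immediate and emits a
// single `addpl x0, x1, #3`; a multiplier that is a multiple of 8 (e.g. 16) is
// redirected to Addvl(x0, x1, 2); other large multipliers compute the offset
// via Rdvl and an arithmetic shift right by 3 (PL = VL / 8).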
    247 
    248 void MacroAssembler::Addvl(const Register& xd,
    249                            const Register& xn,
    250                            int64_t multiplier) {
    251   VIXL_ASSERT(allow_macro_instructions_);
    252   VIXL_ASSERT(xd.IsX());
    253   VIXL_ASSERT(xn.IsX());
    254 
    255   // Check that `VL * multiplier` cannot overflow, for any possible value of VL.
    256   VIXL_ASSERT(multiplier <= (INT64_MAX / kZRegMaxSizeInBytes));
    257   VIXL_ASSERT(multiplier >= (INT64_MIN / kZRegMaxSizeInBytes));
    258 
    259   if (xd.IsZero()) return;
    260   if (xn.IsZero() && xd.IsSP()) {
    261     // TODO: This operation doesn't make much sense, but we could support it
    262     // with a scratch register if necessary. `rdvl` cannot write into `sp`.
    263     VIXL_UNIMPLEMENTED();
    264   }
    265 
    266   if (IsInt6(multiplier)) {
    267     SingleEmissionCheckScope guard(this);
    268     if (xn.IsZero()) {
    269       rdvl(xd, static_cast<int>(multiplier));
    270     } else {
    271       addvl(xd, xn, static_cast<int>(multiplier));
    272     }
    273     return;
    274   }
    275 
    276   // TODO: Some probable cases result in rather long sequences. For example,
    277   // `Addvl(sp, sp, 42)` requires four instructions, even though it's only just
    278   // outside the encodable range. We should look for ways to cover such cases
    279   // without drastically increasing the complexity of this logic.
    280 
    281   // For other cases, calculate xn + (VL * multiplier) using discrete
    282   // instructions. This requires two scratch registers in the general case, so
    283   // we try to re-use the destination as a scratch register.
    284   UseScratchRegisterScope temps(this);
    285   temps.Include(xd);
    286   temps.Exclude(xn);
    287 
    288   Register a = temps.AcquireX();
    289   Mov(a, multiplier);
    290 
    291   MacroEmissionCheckScope guard(this);
    292   Register b = temps.AcquireX();
    293   rdvl(b, 1);
    294   if (xn.IsZero()) {
    295     mul(xd, a, b);
    296   } else if (xd.IsSP() || xn.IsSP()) {
    297     mul(a, a, b);
    298     add(xd, xn, a);
    299   } else {
    300     madd(xd, a, b, xn);
    301   }
    302 }
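// Illustrative annotation (not part of the original file), with assumed
// registers: Addvl(x0, x1, 2) emits `addvl x0, x1, #2` and Addvl(x0, xzr, 2)
// emits `rdvl x0, #2`; multipliers outside the signed 6-bit range fall back to
// Mov + rdvl + madd (or mul + add when sp is involved).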
    303 
    304 void MacroAssembler::CalculateSVEAddress(const Register& xd,
    305                                          const SVEMemOperand& addr,
    306                                          int vl_divisor_log2) {
    307   VIXL_ASSERT(allow_macro_instructions_);
    308   VIXL_ASSERT(!addr.IsScatterGather());
    309   VIXL_ASSERT(xd.IsX());
    310 
    311   // The lower bound is where a whole Z register is accessed.
    312   VIXL_ASSERT(!addr.IsMulVl() || (vl_divisor_log2 >= 0));
    313   // The upper bound is for P register accesses, and for instructions like
    314   // "st1b { z0.d } [...]", where one byte is accessed for every D-sized lane.
    315   VIXL_ASSERT(vl_divisor_log2 <= static_cast<int>(kZRegBitsPerPRegBitLog2));
    316 
    317   SVEOffsetModifier mod = addr.GetOffsetModifier();
    318   Register base = addr.GetScalarBase();
    319 
    320   if (addr.IsEquivalentToScalar()) {
    321     // For example:
    322     //   [x0]
    323     //   [x0, #0]
    324     //   [x0, xzr, LSL 2]
    325     Mov(xd, base);
    326   } else if (addr.IsScalarPlusImmediate()) {
    327     // For example:
    328     //   [x0, #42]
    329     //   [x0, #42, MUL VL]
    330     int64_t offset = addr.GetImmediateOffset();
    331     VIXL_ASSERT(offset != 0);  // Handled by IsEquivalentToScalar.
    332     if (addr.IsMulVl()) {
    333       int vl_divisor = 1 << vl_divisor_log2;
    334       // For all possible values of vl_divisor, we can simply use `Addpl`. This
    335       // will select `addvl` if necessary.
    336       VIXL_ASSERT((kZRegBitsPerPRegBit % vl_divisor) == 0);
    337       Addpl(xd, base, offset * (kZRegBitsPerPRegBit / vl_divisor));
    338     } else {
    339       // IsScalarPlusImmediate() ensures that no other modifiers can occur.
    340       VIXL_ASSERT(mod == NO_SVE_OFFSET_MODIFIER);
    341       Add(xd, base, offset);
    342     }
    343   } else if (addr.IsScalarPlusScalar()) {
    344     // For example:
    345     //   [x0, x1]
    346     //   [x0, x1, LSL #4]
    347     Register offset = addr.GetScalarOffset();
    348     VIXL_ASSERT(!offset.IsZero());  // Handled by IsEquivalentToScalar.
    349     if (mod == SVE_LSL) {
    350       Add(xd, base, Operand(offset, LSL, addr.GetShiftAmount()));
    351     } else {
    352       // IsScalarPlusScalar() ensures that no other modifiers can occur.
    353       VIXL_ASSERT(mod == NO_SVE_OFFSET_MODIFIER);
    354       Add(xd, base, offset);
    355     }
    356   } else {
    357     // All other forms are scatter-gather addresses, which cannot be evaluated
    358     // into an X register.
    359     VIXL_UNREACHABLE();
    360   }
    361 }
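// Illustrative annotation (not part of the original file), with assumed
// registers: evaluating [x0, #2, MUL VL] into x10 for a whole-Z access
// (vl_divisor_log2 == 0) becomes Addpl(x10, x0, 2 * 8), i.e. x10 = x0 + 2 * VL,
// while [x0, x1, LSL #2] becomes `add x10, x0, x1, lsl #2`.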
    362 
    363 void MacroAssembler::Cpy(const ZRegister& zd,
    364                          const PRegister& pg,
    365                          IntegerOperand imm) {
    366   VIXL_ASSERT(allow_macro_instructions_);
    367   VIXL_ASSERT(imm.FitsInLane(zd));
    368   int imm8;
    369   int shift;
    370   if (imm.TryEncodeAsShiftedIntNForLane<8, 0>(zd, &imm8, &shift) ||
    371       imm.TryEncodeAsShiftedIntNForLane<8, 8>(zd, &imm8, &shift)) {
    372     SingleEmissionCheckScope guard(this);
    373     cpy(zd, pg, imm8, shift);
    374     return;
    375   }
    376 
    377   // The fallbacks rely on `cpy` variants that only support merging predication.
    378   // If zeroing predication was requested, zero the destination first.
    379   if (pg.IsZeroing()) {
    380     SingleEmissionCheckScope guard(this);
    381     dup(zd, 0);
    382   }
    383   PRegisterM pg_m = pg.Merging();
    384 
    385   // Try to encode the immediate using fcpy.
    386   VIXL_ASSERT(imm.FitsInLane(zd));
    387   if (zd.GetLaneSizeInBits() >= kHRegSize) {
    388     double fp_imm = 0.0;
    389     switch (zd.GetLaneSizeInBits()) {
    390       case kHRegSize:
    391         fp_imm =
    392             FPToDouble(RawbitsToFloat16(imm.AsUint16()), kIgnoreDefaultNaN);
    393         break;
    394       case kSRegSize:
    395         fp_imm = RawbitsToFloat(imm.AsUint32());
    396         break;
    397       case kDRegSize:
    398         fp_imm = RawbitsToDouble(imm.AsUint64());
    399         break;
    400       default:
    401         VIXL_UNREACHABLE();
    402         break;
    403     }
    404     // IsImmFP64 is equivalent to IsImmFP<n> for the same arithmetic value, so
    405     // we can use IsImmFP64 for all lane sizes.
    406     if (IsImmFP64(fp_imm)) {
    407       SingleEmissionCheckScope guard(this);
    408       fcpy(zd, pg_m, fp_imm);
    409       return;
    410     }
    411   }
    412 
    413   // Fall back to using a scratch register.
    414   UseScratchRegisterScope temps(this);
    415   Register scratch = temps.AcquireRegisterToHoldLane(zd);
    416   Mov(scratch, imm);
    417 
    418   SingleEmissionCheckScope guard(this);
    419   cpy(zd, pg_m, scratch);
    420 }
    421 
    422 // TODO: We implement Fcpy (amongst other things) for all FP types because it
    423 // allows us to preserve user-specified NaNs. We should come up with some
    424 // FPImmediate type to abstract this, and avoid all the duplication below (and
    425 // elsewhere).
    426 
    427 void MacroAssembler::Fcpy(const ZRegister& zd,
    428                           const PRegisterM& pg,
    429                           double imm) {
    430   VIXL_ASSERT(allow_macro_instructions_);
    431   VIXL_ASSERT(pg.IsMerging());
    432 
    433   if (IsImmFP64(imm)) {
    434     SingleEmissionCheckScope guard(this);
    435     fcpy(zd, pg, imm);
    436     return;
    437   }
    438 
    439   // As a fall-back, cast the immediate to the required lane size, and try to
    440   // encode the bit pattern using `Cpy`.
    441   Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm));
    442 }
    443 
    444 void MacroAssembler::Fcpy(const ZRegister& zd,
    445                           const PRegisterM& pg,
    446                           float imm) {
    447   VIXL_ASSERT(allow_macro_instructions_);
    448   VIXL_ASSERT(pg.IsMerging());
    449 
    450   if (IsImmFP32(imm)) {
    451     SingleEmissionCheckScope guard(this);
    452     fcpy(zd, pg, imm);
    453     return;
    454   }
    455 
    456   // As a fall-back, cast the immediate to the required lane size, and try to
    457   // encode the bit pattern using `Cpy`.
    458   Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm));
    459 }
    460 
    461 void MacroAssembler::Fcpy(const ZRegister& zd,
    462                           const PRegisterM& pg,
    463                           Float16 imm) {
    464   VIXL_ASSERT(allow_macro_instructions_);
    465   VIXL_ASSERT(pg.IsMerging());
    466 
    467   if (IsImmFP16(imm)) {
    468     SingleEmissionCheckScope guard(this);
    469     fcpy(zd, pg, imm);
    470     return;
    471   }
    472 
    473   // As a fall-back, cast the immediate to the required lane size, and try to
    474   // encode the bit pattern using `Cpy`.
    475   Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm));
    476 }
    477 
    478 void MacroAssembler::Dup(const ZRegister& zd, IntegerOperand imm) {
    479   VIXL_ASSERT(allow_macro_instructions_);
    480   VIXL_ASSERT(imm.FitsInLane(zd));
    481   unsigned lane_size = zd.GetLaneSizeInBits();
    482   int imm8;
    483   int shift;
    484   if (imm.TryEncodeAsShiftedIntNForLane<8, 0>(zd, &imm8, &shift) ||
    485       imm.TryEncodeAsShiftedIntNForLane<8, 8>(zd, &imm8, &shift)) {
    486     SingleEmissionCheckScope guard(this);
    487     dup(zd, imm8, shift);
    488   } else if (IsImmLogical(imm.AsUintN(lane_size), lane_size)) {
    489     SingleEmissionCheckScope guard(this);
    490     dupm(zd, imm.AsUintN(lane_size));
    491   } else {
    492     UseScratchRegisterScope temps(this);
    493     Register scratch = temps.AcquireRegisterToHoldLane(zd);
    494     Mov(scratch, imm);
    495 
    496     SingleEmissionCheckScope guard(this);
    497     dup(zd, scratch);
    498   }
    499 }
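// Illustrative annotation (not part of the original file), with assumed
// registers: Dup(z0.VnS(), 0x7f00) uses the shifted-immediate form
// (`dup z0.s, #127, lsl #8`), Dup(z0.VnS(), 0x00ff00ff) is a valid logical
// immediate and so uses `dupm`, and any other value is moved into a scratch
// general-purpose register and broadcast with the scalar form of `dup`.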
    500 
    501 void MacroAssembler::NoncommutativeArithmeticHelper(
    502     const ZRegister& zd,
    503     const PRegisterM& pg,
    504     const ZRegister& zn,
    505     const ZRegister& zm,
    506     SVEArithPredicatedFn fn,
    507     SVEArithPredicatedFn rev_fn) {
    508   if (zd.Aliases(zn)) {
    509     // E.g. zd = zd / zm
    510     SingleEmissionCheckScope guard(this);
    511     (this->*fn)(zd, pg, zn, zm);
    512   } else if (zd.Aliases(zm)) {
    513     // E.g. zd = zn / zd
    514     SingleEmissionCheckScope guard(this);
    515     (this->*rev_fn)(zd, pg, zm, zn);
    516   } else {
    517     // E.g. zd = zn / zm
    518     MovprfxHelperScope guard(this, zd, pg, zn);
    519     (this->*fn)(zd, pg, zd, zm);
    520   }
    521 }
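// Illustrative annotation (not part of the original file), with assumed
// registers: for Fsub(z0.VnS(), p0.Merging(), z1.VnS(), z0.VnS()) the helper
// above preserves the operand order by selecting the reversed instruction
// (`fsubr`), computing z0 = z1 - z0 in place; movprfx + fsub is only needed
// when zd aliases neither source operand.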
    522 
    523 void MacroAssembler::FPCommutativeArithmeticHelper(
    524     const ZRegister& zd,
    525     const PRegisterM& pg,
    526     const ZRegister& zn,
    527     const ZRegister& zm,
    528     SVEArithPredicatedFn fn,
    529     FPMacroNaNPropagationOption nan_option) {
    530   ResolveFPNaNPropagationOption(&nan_option);
    531 
    532   if (zd.Aliases(zn)) {
    533     SingleEmissionCheckScope guard(this);
    534     (this->*fn)(zd, pg, zd, zm);
    535   } else if (zd.Aliases(zm)) {
    536     switch (nan_option) {
    537       case FastNaNPropagation: {
    538         // Swap the arguments.
    539         SingleEmissionCheckScope guard(this);
    540         (this->*fn)(zd, pg, zd, zn);
    541         return;
    542       }
    543       case StrictNaNPropagation: {
    544         UseScratchRegisterScope temps(this);
    545         // Use a scratch register to keep the argument order exactly as
    546         // specified.
    547         ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zn);
    548         {
    549           MovprfxHelperScope guard(this, scratch, pg, zn);
    550           (this->*fn)(scratch, pg, scratch, zm);
    551         }
    552         Mov(zd, scratch);
    553         return;
    554       }
    555       case NoFPMacroNaNPropagationSelected:
    556         VIXL_UNREACHABLE();
    557         return;
    558     }
    559   } else {
    560     MovprfxHelperScope guard(this, zd, pg, zn);
    561     (this->*fn)(zd, pg, zd, zm);
    562   }
    563 }
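// Illustrative annotation (not part of the original file), with assumed
// registers: when zd aliases zm, e.g.
// Fadd(z0.VnD(), p1.Merging(), z1.VnD(), z0.VnD(), FastNaNPropagation), the
// operands are simply swapped; under StrictNaNPropagation the original order
// is preserved by computing into a scratch Z register and moving the result
// into zd afterwards.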
    564 
    565 // Instructions of the form "inst zda, zn, zm, #num", where they are
    566 // non-commutative and no reversed form is provided.
    567 #define VIXL_SVE_NONCOMM_ARITH_ZZZZI_LIST(V) \
    568   V(Cmla, cmla)                              \
    569   V(Sqrdcmlah, sqrdcmlah)
    570 
    571 #define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN)                     \
    572   void MacroAssembler::MASMFN(const ZRegister& zd,               \
    573                               const ZRegister& za,               \
    574                               const ZRegister& zn,               \
    575                               const ZRegister& zm,               \
    576                               int imm) {                         \
    577     if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) { \
    578       UseScratchRegisterScope temps(this);                       \
    579       VIXL_ASSERT(AreSameLaneSize(zn, zm));                      \
    580       ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zn);  \
    581       Mov(ztmp, zd.Aliases(zn) ? zn : zm);                       \
    582       MovprfxHelperScope guard(this, zd, za);                    \
    583       ASMFN(zd,                                                  \
    584             (zd.Aliases(zn) ? ztmp : zn),                        \
    585             (zd.Aliases(zm) ? ztmp : zm),                        \
    586             imm);                                                \
    587     } else {                                                     \
    588       MovprfxHelperScope guard(this, zd, za);                    \
    589       ASMFN(zd, zn, zm, imm);                                    \
    590     }                                                            \
    591   }
    592 VIXL_SVE_NONCOMM_ARITH_ZZZZI_LIST(VIXL_DEFINE_MASM_FUNC)
    593 #undef VIXL_DEFINE_MASM_FUNC
    594 
    595 // Instructions of the form "inst zda, zn, zm, #num, #num", where they are
    596 // non-commutative and no reversed form is provided.
    597 #define VIXL_SVE_NONCOMM_ARITH_ZZZZII_LIST(V) \
    598   V(Cmla, cmla)                               \
    599   V(Sqrdcmlah, sqrdcmlah)
    600 
    601 // This doesn't handle zm when it's out of the range that can be encoded in the
    602 // instruction. The range depends on element size: z0-z7 for H, z0-z15 for S.
    603 #define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN)                     \
    604   void MacroAssembler::MASMFN(const ZRegister& zd,               \
    605                               const ZRegister& za,               \
    606                               const ZRegister& zn,               \
    607                               const ZRegister& zm,               \
    608                               int index,                         \
    609                               int rot) {                         \
    610     if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) { \
    611       UseScratchRegisterScope temps(this);                       \
    612       ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zd);  \
    613       {                                                          \
    614         MovprfxHelperScope guard(this, ztmp, za);                \
    615         ASMFN(ztmp, zn, zm, index, rot);                         \
    616       }                                                          \
    617       Mov(zd, ztmp);                                             \
    618     } else {                                                     \
    619       MovprfxHelperScope guard(this, zd, za);                    \
    620       ASMFN(zd, zn, zm, index, rot);                             \
    621     }                                                            \
    622   }
    623 VIXL_SVE_NONCOMM_ARITH_ZZZZII_LIST(VIXL_DEFINE_MASM_FUNC)
    624 #undef VIXL_DEFINE_MASM_FUNC
    625 
    626 // Instructions of the form "inst zda, pg, zda, zn", where they are
    627 // non-commutative and no reversed form is provided.
    628 #define VIXL_SVE_NONCOMM_ARITH_ZPZZ_LIST(V) \
    629   V(Addp, addp)                             \
    630   V(Bic, bic)                               \
    631   V(Faddp, faddp)                           \
    632   V(Fmaxnmp, fmaxnmp)                       \
    633   V(Fminnmp, fminnmp)                       \
    634   V(Fmaxp, fmaxp)                           \
    635   V(Fminp, fminp)                           \
    636   V(Fscale, fscale)                         \
    637   V(Smaxp, smaxp)                           \
    638   V(Sminp, sminp)                           \
    639   V(Suqadd, suqadd)                         \
    640   V(Umaxp, umaxp)                           \
    641   V(Uminp, uminp)                           \
    642   V(Usqadd, usqadd)
    643 
    644 #define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN)                       \
    645   void MacroAssembler::MASMFN(const ZRegister& zd,                 \
    646                               const PRegisterM& pg,                \
    647                               const ZRegister& zn,                 \
    648                               const ZRegister& zm) {               \
    649     VIXL_ASSERT(allow_macro_instructions_);                        \
    650     if (zd.Aliases(zm) && !zd.Aliases(zn)) {                       \
    651       UseScratchRegisterScope temps(this);                         \
    652       ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zm); \
    653       Mov(scratch, zm);                                            \
    654       MovprfxHelperScope guard(this, zd, pg, zn);                  \
    655       ASMFN(zd, pg, zd, scratch);                                  \
    656     } else {                                                       \
    657       MovprfxHelperScope guard(this, zd, pg, zn);                  \
    658       ASMFN(zd, pg, zd, zm);                                       \
    659     }                                                              \
    660   }
    661 VIXL_SVE_NONCOMM_ARITH_ZPZZ_LIST(VIXL_DEFINE_MASM_FUNC)
    662 #undef VIXL_DEFINE_MASM_FUNC
    663 
    664 // Instructions of the form "inst zda, pg, zda, zn", where they are
    665 // non-commutative and a reversed form is provided.
    666 #define VIXL_SVE_NONCOMM_ARITH_REVERSE_ZPZZ_LIST(V) \
    667   V(Asr, asr)                                       \
    668   V(Fdiv, fdiv)                                     \
    669   V(Fsub, fsub)                                     \
    670   V(Lsl, lsl)                                       \
    671   V(Lsr, lsr)                                       \
    672   V(Sdiv, sdiv)                                     \
    673   V(Shsub, shsub)                                   \
    674   V(Sqrshl, sqrshl)                                 \
    675   V(Sqshl, sqshl)                                   \
    676   V(Sqsub, sqsub)                                   \
    677   V(Srshl, srshl)                                   \
    678   V(Sub, sub)                                       \
    679   V(Udiv, udiv)                                     \
    680   V(Uhsub, uhsub)                                   \
    681   V(Uqrshl, uqrshl)                                 \
    682   V(Uqshl, uqshl)                                   \
    683   V(Uqsub, uqsub)                                   \
    684   V(Urshl, urshl)
    685 
    686 #define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN)                          \
    687   void MacroAssembler::MASMFN(const ZRegister& zd,                    \
    688                               const PRegisterM& pg,                   \
    689                               const ZRegister& zn,                    \
    690                               const ZRegister& zm) {                  \
    691     VIXL_ASSERT(allow_macro_instructions_);                           \
    692     NoncommutativeArithmeticHelper(zd,                                \
    693                                    pg,                                \
    694                                    zn,                                \
    695                                    zm,                                \
    696                                    static_cast<SVEArithPredicatedFn>( \
    697                                        &Assembler::ASMFN),            \
    698                                    static_cast<SVEArithPredicatedFn>( \
    699                                        &Assembler::ASMFN##r));        \
    700   }
    701 VIXL_SVE_NONCOMM_ARITH_REVERSE_ZPZZ_LIST(VIXL_DEFINE_MASM_FUNC)
    702 #undef VIXL_DEFINE_MASM_FUNC
    703 
    704 void MacroAssembler::Fadd(const ZRegister& zd,
    705                           const PRegisterM& pg,
    706                           const ZRegister& zn,
    707                           const ZRegister& zm,
    708                           FPMacroNaNPropagationOption nan_option) {
    709   VIXL_ASSERT(allow_macro_instructions_);
    710   FPCommutativeArithmeticHelper(zd,
    711                                 pg,
    712                                 zn,
    713                                 zm,
    714                                 static_cast<SVEArithPredicatedFn>(
    715                                     &Assembler::fadd),
    716                                 nan_option);
    717 }
    718 
    719 void MacroAssembler::Fabd(const ZRegister& zd,
    720                           const PRegisterM& pg,
    721                           const ZRegister& zn,
    722                           const ZRegister& zm,
    723                           FPMacroNaNPropagationOption nan_option) {
    724   VIXL_ASSERT(allow_macro_instructions_);
    725   FPCommutativeArithmeticHelper(zd,
    726                                 pg,
    727                                 zn,
    728                                 zm,
    729                                 static_cast<SVEArithPredicatedFn>(
    730                                     &Assembler::fabd),
    731                                 nan_option);
    732 }
    733 
    734 void MacroAssembler::Fmul(const ZRegister& zd,
    735                           const PRegisterM& pg,
    736                           const ZRegister& zn,
    737                           const ZRegister& zm,
    738                           FPMacroNaNPropagationOption nan_option) {
    739   VIXL_ASSERT(allow_macro_instructions_);
    740   FPCommutativeArithmeticHelper(zd,
    741                                 pg,
    742                                 zn,
    743                                 zm,
    744                                 static_cast<SVEArithPredicatedFn>(
    745                                     &Assembler::fmul),
    746                                 nan_option);
    747 }
    748 
    749 void MacroAssembler::Fmulx(const ZRegister& zd,
    750                            const PRegisterM& pg,
    751                            const ZRegister& zn,
    752                            const ZRegister& zm,
    753                            FPMacroNaNPropagationOption nan_option) {
    754   VIXL_ASSERT(allow_macro_instructions_);
    755   FPCommutativeArithmeticHelper(zd,
    756                                 pg,
    757                                 zn,
    758                                 zm,
    759                                 static_cast<SVEArithPredicatedFn>(
    760                                     &Assembler::fmulx),
    761                                 nan_option);
    762 }
    763 
    764 void MacroAssembler::Fmax(const ZRegister& zd,
    765                           const PRegisterM& pg,
    766                           const ZRegister& zn,
    767                           const ZRegister& zm,
    768                           FPMacroNaNPropagationOption nan_option) {
    769   VIXL_ASSERT(allow_macro_instructions_);
    770   FPCommutativeArithmeticHelper(zd,
    771                                 pg,
    772                                 zn,
    773                                 zm,
    774                                 static_cast<SVEArithPredicatedFn>(
    775                                     &Assembler::fmax),
    776                                 nan_option);
    777 }
    778 
    779 void MacroAssembler::Fmin(const ZRegister& zd,
    780                           const PRegisterM& pg,
    781                           const ZRegister& zn,
    782                           const ZRegister& zm,
    783                           FPMacroNaNPropagationOption nan_option) {
    784   VIXL_ASSERT(allow_macro_instructions_);
    785   FPCommutativeArithmeticHelper(zd,
    786                                 pg,
    787                                 zn,
    788                                 zm,
    789                                 static_cast<SVEArithPredicatedFn>(
    790                                     &Assembler::fmin),
    791                                 nan_option);
    792 }
    793 
    794 void MacroAssembler::Fmaxnm(const ZRegister& zd,
    795                             const PRegisterM& pg,
    796                             const ZRegister& zn,
    797                             const ZRegister& zm,
    798                             FPMacroNaNPropagationOption nan_option) {
    799   VIXL_ASSERT(allow_macro_instructions_);
    800   FPCommutativeArithmeticHelper(zd,
    801                                 pg,
    802                                 zn,
    803                                 zm,
    804                                 static_cast<SVEArithPredicatedFn>(
    805                                     &Assembler::fmaxnm),
    806                                 nan_option);
    807 }
    808 
    809 void MacroAssembler::Fminnm(const ZRegister& zd,
    810                             const PRegisterM& pg,
    811                             const ZRegister& zn,
    812                             const ZRegister& zm,
    813                             FPMacroNaNPropagationOption nan_option) {
    814   VIXL_ASSERT(allow_macro_instructions_);
    815   FPCommutativeArithmeticHelper(zd,
    816                                 pg,
    817                                 zn,
    818                                 zm,
    819                                 static_cast<SVEArithPredicatedFn>(
    820                                     &Assembler::fminnm),
    821                                 nan_option);
    822 }
    823 
    824 void MacroAssembler::Fdup(const ZRegister& zd, double imm) {
    825   VIXL_ASSERT(allow_macro_instructions_);
    826 
    827   switch (zd.GetLaneSizeInBits()) {
    828     case kHRegSize:
    829       Fdup(zd, Float16(imm));
    830       break;
    831     case kSRegSize:
    832       Fdup(zd, static_cast<float>(imm));
    833       break;
    834     case kDRegSize:
    835       uint64_t bits = DoubleToRawbits(imm);
    836       if (IsImmFP64(bits)) {
    837         SingleEmissionCheckScope guard(this);
    838         fdup(zd, imm);
    839       } else {
    840         Dup(zd, bits);
    841       }
    842       break;
    843   }
    844 }
    845 
    846 void MacroAssembler::Fdup(const ZRegister& zd, float imm) {
    847   VIXL_ASSERT(allow_macro_instructions_);
    848 
    849   switch (zd.GetLaneSizeInBits()) {
    850     case kHRegSize:
    851       Fdup(zd, Float16(imm));
    852       break;
    853     case kSRegSize:
    854       if (IsImmFP32(imm)) {
    855         SingleEmissionCheckScope guard(this);
    856         fdup(zd, imm);
    857       } else {
    858         Dup(zd, FloatToRawbits(imm));
    859       }
    860       break;
    861     case kDRegSize:
    862       Fdup(zd, static_cast<double>(imm));
    863       break;
    864   }
    865 }
    866 
    867 void MacroAssembler::Fdup(const ZRegister& zd, Float16 imm) {
    868   VIXL_ASSERT(allow_macro_instructions_);
    869 
    870   switch (zd.GetLaneSizeInBits()) {
    871     case kHRegSize:
    872       if (IsImmFP16(imm)) {
    873         SingleEmissionCheckScope guard(this);
    874         fdup(zd, imm);
    875       } else {
    876         Dup(zd, Float16ToRawbits(imm));
    877       }
    878       break;
    879     case kSRegSize:
    880       Fdup(zd, FPToFloat(imm, kIgnoreDefaultNaN));
    881       break;
    882     case kDRegSize:
    883       Fdup(zd, FPToDouble(imm, kIgnoreDefaultNaN));
    884       break;
    885   }
    886 }
    887 
    888 void MacroAssembler::Index(const ZRegister& zd,
    889                            const Operand& start,
    890                            const Operand& step) {
    891   class IndexOperand : public Operand {
    892    public:
    893     static IndexOperand Prepare(MacroAssembler* masm,
    894                                 UseScratchRegisterScope* temps,
    895                                 const Operand& op,
    896                                 const ZRegister& zd_inner) {
    897       // Look for encodable immediates.
    898       int imm;
    899       if (op.IsImmediate()) {
    900         if (IntegerOperand(op).TryEncodeAsIntNForLane<5>(zd_inner, &imm)) {
    901           return IndexOperand(imm);
    902         }
    903         Register scratch = temps->AcquireRegisterToHoldLane(zd_inner);
    904         masm->Mov(scratch, op);
    905         return IndexOperand(scratch);
    906       } else {
    907         // Plain registers can be encoded directly.
    908         VIXL_ASSERT(op.IsPlainRegister());
    909         return IndexOperand(op.GetRegister());
    910       }
    911     }
    912 
    913     int GetImm5() const {
    914       int64_t imm = GetImmediate();
    915       VIXL_ASSERT(IsInt5(imm));
    916       return static_cast<int>(imm);
    917     }
    918 
    919    private:
    920     explicit IndexOperand(const Register& reg) : Operand(reg) {}
    921     explicit IndexOperand(int64_t imm) : Operand(imm) {}
    922   };
    923 
    924   UseScratchRegisterScope temps(this);
    925   IndexOperand start_enc = IndexOperand::Prepare(this, &temps, start, zd);
    926   IndexOperand step_enc = IndexOperand::Prepare(this, &temps, step, zd);
    927 
    928   SingleEmissionCheckScope guard(this);
    929   if (start_enc.IsImmediate()) {
    930     if (step_enc.IsImmediate()) {
    931       index(zd, start_enc.GetImm5(), step_enc.GetImm5());
    932     } else {
    933       index(zd, start_enc.GetImm5(), step_enc.GetRegister());
    934     }
    935   } else {
    936     if (step_enc.IsImmediate()) {
    937       index(zd, start_enc.GetRegister(), step_enc.GetImm5());
    938     } else {
    939       index(zd, start_enc.GetRegister(), step_enc.GetRegister());
    940     }
    941   }
    942 }
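// Illustrative annotation (not part of the original file), with assumed
// registers: Index(z0.VnS(), 0, 1) encodes both operands as 5-bit immediates
// (`index z0.s, #0, #1`); a start or step outside [-16, 15] is first moved
// into a general-purpose scratch register and the register form of `index` is
// used instead.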
    943 
    944 void MacroAssembler::Insr(const ZRegister& zdn, IntegerOperand imm) {
    945   VIXL_ASSERT(allow_macro_instructions_);
    946   VIXL_ASSERT(imm.FitsInLane(zdn));
    947 
    948   if (imm.IsZero()) {
    949     SingleEmissionCheckScope guard(this);
    950     insr(zdn, xzr);
    951     return;
    952   }
    953 
    954   UseScratchRegisterScope temps(this);
    955   Register scratch = temps.AcquireRegisterToHoldLane(zdn);
    956 
    957   // TODO: There are many cases where we could optimise immediates, such as by
    958   // detecting repeating patterns or FP immediates. We should optimise and
    959   // abstract this for use in other SVE mov-immediate-like macros.
    960   Mov(scratch, imm);
    961 
    962   SingleEmissionCheckScope guard(this);
    963   insr(zdn, scratch);
    964 }
    965 
    966 void MacroAssembler::Mla(const ZRegister& zd,
    967                          const PRegisterM& pg,
    968                          const ZRegister& za,
    969                          const ZRegister& zn,
    970                          const ZRegister& zm) {
    971   VIXL_ASSERT(allow_macro_instructions_);
    972   if (zd.Aliases(za)) {
    973     // zda = zda + (zn * zm)
    974     SingleEmissionCheckScope guard(this);
    975     mla(zd, pg, zn, zm);
    976   } else if (zd.Aliases(zn)) {
    977     // zdn = za + (zdn * zm)
    978     SingleEmissionCheckScope guard(this);
    979     mad(zd, pg, zm, za);
    980   } else if (zd.Aliases(zm)) {
    981     // Multiplication is commutative, so we can swap zn and zm.
    982     // zdm = za + (zdm * zn)
    983     SingleEmissionCheckScope guard(this);
    984     mad(zd, pg, zn, za);
    985   } else {
    986     // zd = za + (zn * zm)
    987     ExactAssemblyScope guard(this, 2 * kInstructionSize);
    988     movprfx(zd, pg, za);
    989     mla(zd, pg, zn, zm);
    990   }
    991 }
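// Illustrative annotation (not part of the original file), with assumed
// registers: Mla(z0.VnS(), p0.Merging(), z0.VnS(), z1.VnS(), z2.VnS()) maps
// directly onto `mla z0.s, p0/m, z1.s, z2.s`; when zd aliases a multiplicand
// instead, the accumulation is re-expressed with `mad`, and otherwise
// movprfx + mla is emitted.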
    992 
    993 void MacroAssembler::Mls(const ZRegister& zd,
    994                          const PRegisterM& pg,
    995                          const ZRegister& za,
    996                          const ZRegister& zn,
    997                          const ZRegister& zm) {
    998   VIXL_ASSERT(allow_macro_instructions_);
    999   if (zd.Aliases(za)) {
   1000     // zda = zda - (zn * zm)
   1001     SingleEmissionCheckScope guard(this);
   1002     mls(zd, pg, zn, zm);
   1003   } else if (zd.Aliases(zn)) {
   1004     // zdn = za - (zdn * zm)
   1005     SingleEmissionCheckScope guard(this);
   1006     msb(zd, pg, zm, za);
   1007   } else if (zd.Aliases(zm)) {
   1008     // Multiplication is commutative, so we can swap zn and zm.
   1009     // zdm = za - (zdm * zn)
   1010     SingleEmissionCheckScope guard(this);
   1011     msb(zd, pg, zn, za);
   1012   } else {
   1013     // zd = za - (zn * zm)
   1014     ExactAssemblyScope guard(this, 2 * kInstructionSize);
   1015     movprfx(zd, pg, za);
   1016     mls(zd, pg, zn, zm);
   1017   }
   1018 }
   1019 
   1020 void MacroAssembler::CompareHelper(Condition cond,
   1021                                    const PRegisterWithLaneSize& pd,
   1022                                    const PRegisterZ& pg,
   1023                                    const ZRegister& zn,
   1024                                    IntegerOperand imm) {
   1025   UseScratchRegisterScope temps(this);
   1026   ZRegister zm = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
   1027   Dup(zm, imm);
   1028   SingleEmissionCheckScope guard(this);
   1029   cmp(cond, pd, pg, zn, zm);
   1030 }
   1031 
   1032 void MacroAssembler::Pfirst(const PRegisterWithLaneSize& pd,
   1033                             const PRegister& pg,
   1034                             const PRegisterWithLaneSize& pn) {
   1035   VIXL_ASSERT(allow_macro_instructions_);
   1036   VIXL_ASSERT(pd.IsLaneSizeB());
   1037   VIXL_ASSERT(pn.IsLaneSizeB());
   1038   if (pd.Is(pn)) {
   1039     SingleEmissionCheckScope guard(this);
   1040     pfirst(pd, pg, pn);
   1041   } else {
   1042     UseScratchRegisterScope temps(this);
   1043     PRegister temp_pg = pg;
   1044     if (pd.Aliases(pg)) {
   1045       temp_pg = temps.AcquireP();
   1046       Mov(temp_pg.VnB(), pg.VnB());
   1047     }
   1048     Mov(pd, pn);
   1049     SingleEmissionCheckScope guard(this);
   1050     pfirst(pd, temp_pg, pd);
   1051   }
   1052 }
   1053 
   1054 void MacroAssembler::Pnext(const PRegisterWithLaneSize& pd,
   1055                            const PRegister& pg,
   1056                            const PRegisterWithLaneSize& pn) {
   1057   VIXL_ASSERT(allow_macro_instructions_);
   1058   VIXL_ASSERT(AreSameFormat(pd, pn));
   1059   if (pd.Is(pn)) {
   1060     SingleEmissionCheckScope guard(this);
   1061     pnext(pd, pg, pn);
   1062   } else {
   1063     UseScratchRegisterScope temps(this);
   1064     PRegister temp_pg = pg;
   1065     if (pd.Aliases(pg)) {
   1066       temp_pg = temps.AcquireP();
   1067       Mov(temp_pg.VnB(), pg.VnB());
   1068     }
   1069     Mov(pd.VnB(), pn.VnB());
   1070     SingleEmissionCheckScope guard(this);
   1071     pnext(pd, temp_pg, pd);
   1072   }
   1073 }
   1074 
   1075 void MacroAssembler::Ptrue(const PRegisterWithLaneSize& pd,
   1076                            SVEPredicateConstraint pattern,
   1077                            FlagsUpdate s) {
   1078   VIXL_ASSERT(allow_macro_instructions_);
   1079   switch (s) {
   1080     case LeaveFlags:
   1081       Ptrue(pd, pattern);
   1082       return;
   1083     case SetFlags:
   1084       Ptrues(pd, pattern);
   1085       return;
   1086   }
   1087   VIXL_UNREACHABLE();
   1088 }
   1089 
   1090 void MacroAssembler::Sub(const ZRegister& zd,
   1091                          IntegerOperand imm,
   1092                          const ZRegister& zm) {
   1093   VIXL_ASSERT(allow_macro_instructions_);
   1094 
   1095   int imm8;
   1096   int shift = -1;
   1097   if (imm.TryEncodeAsShiftedUintNForLane<8, 0>(zd, &imm8, &shift) ||
   1098       imm.TryEncodeAsShiftedUintNForLane<8, 8>(zd, &imm8, &shift)) {
   1099     MovprfxHelperScope guard(this, zd, zm);
   1100     subr(zd, zd, imm8, shift);
   1101   } else {
   1102     UseScratchRegisterScope temps(this);
   1103     ZRegister scratch = temps.AcquireZ().WithLaneSize(zm.GetLaneSizeInBits());
   1104     Dup(scratch, imm);
   1105 
   1106     SingleEmissionCheckScope guard(this);
   1107     sub(zd, scratch, zm);
   1108   }
   1109 }
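// Illustrative annotation (not part of the original file), with assumed
// registers: the reversed form above lets Sub(z0.VnS(), 10, z1.VnS()) become
// `movprfx z0, z1` + `subr z0.s, z0.s, #10` (i.e. z0 = 10 - z1); immediates
// that cannot be encoded are first broadcast with Dup into a scratch Z
// register and subtracted with the plain vector `sub`.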
   1110 
   1111 void MacroAssembler::SVELoadBroadcastImmHelper(const ZRegister& zt,
   1112                                                const PRegisterZ& pg,
   1113                                                const SVEMemOperand& addr,
   1114                                                SVELoadBroadcastFn fn,
   1115                                                int divisor) {
   1116   VIXL_ASSERT(addr.IsScalarPlusImmediate());
   1117   int64_t imm = addr.GetImmediateOffset();
   1118   if ((imm % divisor == 0) && IsUint6(imm / divisor)) {
   1119     SingleEmissionCheckScope guard(this);
   1120     (this->*fn)(zt, pg, addr);
   1121   } else {
   1122     UseScratchRegisterScope temps(this);
   1123     Register scratch = temps.AcquireX();
   1124     CalculateSVEAddress(scratch, addr, zt);
   1125     SingleEmissionCheckScope guard(this);
   1126     (this->*fn)(zt, pg, SVEMemOperand(scratch));
   1127   }
   1128 }
   1129 
   1130 void MacroAssembler::SVELoadStoreScalarImmHelper(const CPURegister& rt,
   1131                                                  const SVEMemOperand& addr,
   1132                                                  SVELoadStoreFn fn) {
   1133   VIXL_ASSERT(allow_macro_instructions_);
   1134   VIXL_ASSERT(rt.IsZRegister() || rt.IsPRegister());
   1135 
   1136   if (addr.IsPlainScalar() ||
   1137       (addr.IsScalarPlusImmediate() && IsInt9(addr.GetImmediateOffset()) &&
   1138        addr.IsMulVl())) {
   1139     SingleEmissionCheckScope guard(this);
   1140     (this->*fn)(rt, addr);
   1141     return;
   1142   }
   1143 
   1144   if (addr.IsEquivalentToScalar()) {
   1145     SingleEmissionCheckScope guard(this);
   1146     (this->*fn)(rt, SVEMemOperand(addr.GetScalarBase()));
   1147     return;
   1148   }
   1149 
   1150   UseScratchRegisterScope temps(this);
   1151   Register scratch = temps.AcquireX();
   1152   CalculateSVEAddress(scratch, addr, rt);
   1153   SingleEmissionCheckScope guard(this);
   1154   (this->*fn)(rt, SVEMemOperand(scratch));
   1155 }
   1156 
   1157 template <typename Tg, typename Tf>
   1158 void MacroAssembler::SVELoadStoreNTBroadcastQOHelper(
   1159     const ZRegister& zt,
   1160     const Tg& pg,
   1161     const SVEMemOperand& addr,
   1162     Tf fn,
   1163     int imm_bits,
   1164     int shift_amount,
   1165     SVEOffsetModifier supported_modifier,
   1166     int vl_divisor_log2) {
   1167   VIXL_ASSERT(allow_macro_instructions_);
   1168   int imm_divisor = 1 << shift_amount;
   1169 
   1170   if (addr.IsPlainScalar() ||
   1171       (addr.IsScalarPlusImmediate() &&
   1172        IsIntN(imm_bits, addr.GetImmediateOffset() / imm_divisor) &&
   1173        ((addr.GetImmediateOffset() % imm_divisor) == 0) &&
   1174        (addr.GetOffsetModifier() == supported_modifier))) {
   1175     SingleEmissionCheckScope guard(this);
   1176     (this->*fn)(zt, pg, addr);
   1177     return;
   1178   }
   1179 
   1180   if (addr.IsScalarPlusScalar() && !addr.GetScalarOffset().IsZero() &&
   1181       addr.IsEquivalentToLSL(zt.GetLaneSizeInBytesLog2())) {
   1182     SingleEmissionCheckScope guard(this);
   1183     (this->*fn)(zt, pg, addr);
   1184     return;
   1185   }
   1186 
   1187   if (addr.IsEquivalentToScalar()) {
   1188     SingleEmissionCheckScope guard(this);
   1189     (this->*fn)(zt, pg, SVEMemOperand(addr.GetScalarBase()));
   1190     return;
   1191   }
   1192 
   1193   if (addr.IsMulVl() && (supported_modifier != SVE_MUL_VL) &&
   1194       (vl_divisor_log2 == -1)) {
   1195     // We don't handle [x0, #imm, MUL VL] if the in-memory access size is not VL
   1196     // dependent.
   1197     VIXL_UNIMPLEMENTED();
   1198   }
   1199 
   1200   UseScratchRegisterScope temps(this);
   1201   Register scratch = temps.AcquireX();
   1202   CalculateSVEAddress(scratch, addr, vl_divisor_log2);
   1203   SingleEmissionCheckScope guard(this);
   1204   (this->*fn)(zt, pg, SVEMemOperand(scratch));
   1205 }
   1206 
   1207 template <typename Tg, typename Tf>
   1208 void MacroAssembler::SVELoadStore1Helper(int msize_in_bytes_log2,
   1209                                          const ZRegister& zt,
   1210                                          const Tg& pg,
   1211                                          const SVEMemOperand& addr,
   1212                                          Tf fn) {
   1213   if (addr.IsPlainScalar() ||
   1214       (addr.IsScalarPlusScalar() && !addr.GetScalarOffset().IsZero() &&
   1215        addr.IsEquivalentToLSL(msize_in_bytes_log2)) ||
   1216       (addr.IsScalarPlusImmediate() && IsInt4(addr.GetImmediateOffset()) &&
   1217        addr.IsMulVl())) {
   1218     SingleEmissionCheckScope guard(this);
   1219     (this->*fn)(zt, pg, addr);
   1220     return;
   1221   }
   1222 
   1223   if (addr.IsEquivalentToScalar()) {
   1224     SingleEmissionCheckScope guard(this);
   1225     (this->*fn)(zt, pg, SVEMemOperand(addr.GetScalarBase()));
   1226     return;
   1227   }
   1228 
   1229   if (addr.IsVectorPlusImmediate()) {
   1230     uint64_t offset = addr.GetImmediateOffset();
   1231     if (IsMultiple(offset, (1 << msize_in_bytes_log2)) &&
   1232         IsUint5(offset >> msize_in_bytes_log2)) {
   1233       SingleEmissionCheckScope guard(this);
   1234       (this->*fn)(zt, pg, addr);
   1235       return;
   1236     }
   1237   }
   1238 
   1239   if (addr.IsScalarPlusVector()) {
   1240     VIXL_ASSERT(addr.IsScatterGather());
   1241     SingleEmissionCheckScope guard(this);
   1242     (this->*fn)(zt, pg, addr);
   1243     return;
   1244   }
   1245 
   1246   UseScratchRegisterScope temps(this);
   1247   if (addr.IsScatterGather()) {
   1248     // In scatter-gather modes, zt and zn/zm have the same lane size. However,
   1249     // for 32-bit accesses, the result of each lane's address calculation still
   1250     // requires 64 bits; we can't naively use `Adr` for the address calculation
   1251     // because it would truncate each address to 32 bits.
   1252 
   1253     if (addr.IsVectorPlusImmediate()) {
   1254       // Synthesise the immediate in an X register, then use a
   1255       // scalar-plus-vector access with the original vector.
   1256       Register scratch = temps.AcquireX();
   1257       Mov(scratch, addr.GetImmediateOffset());
   1258       SingleEmissionCheckScope guard(this);
   1259       SVEOffsetModifier om =
   1260           zt.IsLaneSizeS() ? SVE_UXTW : NO_SVE_OFFSET_MODIFIER;
   1261       (this->*fn)(zt, pg, SVEMemOperand(scratch, addr.GetVectorBase(), om));
   1262       return;
   1263     }
   1264 
   1265     VIXL_UNIMPLEMENTED();
   1266   } else {
   1267     Register scratch = temps.AcquireX();
   1268     // TODO: If we have an immediate offset that is a multiple of
   1269     // msize_in_bytes, we can use Rdvl/Rdpl and a scalar-plus-scalar form to
   1270     // save an instruction.
   1271     int vl_divisor_log2 = zt.GetLaneSizeInBytesLog2() - msize_in_bytes_log2;
   1272     CalculateSVEAddress(scratch, addr, vl_divisor_log2);
   1273     SingleEmissionCheckScope guard(this);
   1274     (this->*fn)(zt, pg, SVEMemOperand(scratch));
   1275   }
   1276 }
   1277 
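        // Helper for the first-fault (LDFF1*) macro instructions. Gather forms share
        // their handling with the normal loads above; contiguous forms are emitted
        // only when the operand is already encodable, and anything else is currently
        // unimplemented.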
   1278 template <typename Tf>
   1279 void MacroAssembler::SVELoadFFHelper(int msize_in_bytes_log2,
   1280                                      const ZRegister& zt,
   1281                                      const PRegisterZ& pg,
   1282                                      const SVEMemOperand& addr,
   1283                                      Tf fn) {
   1284   if (addr.IsScatterGather()) {
   1285     // Scatter-gather first-fault loads share encodings with normal loads.
   1286     SVELoadStore1Helper(msize_in_bytes_log2, zt, pg, addr, fn);
   1287     return;
   1288   }
   1289 
   1290   // Contiguous first-faulting loads have no scalar-plus-immediate form at all,
   1291   // so we don't do immediate synthesis.
   1292 
   1293   // We cannot currently distinguish "[x0]" from "[x0, #0]", and this
   1294   // is not "scalar-plus-scalar", so we have to permit `IsPlainScalar()` here.
   1295   if (addr.IsPlainScalar() || (addr.IsScalarPlusScalar() &&
   1296                                addr.IsEquivalentToLSL(msize_in_bytes_log2))) {
   1297     SingleEmissionCheckScope guard(this);
   1298     (this->*fn)(zt, pg, addr);
   1299     return;
   1300   }
   1301 
   1302   VIXL_UNIMPLEMENTED();
   1303 }
   1304 
   1305 void MacroAssembler::Ld1b(const ZRegister& zt,
   1306                           const PRegisterZ& pg,
   1307                           const SVEMemOperand& addr) {
   1308   VIXL_ASSERT(allow_macro_instructions_);
   1309   SVELoadStore1Helper(kBRegSizeInBytesLog2,
   1310                       zt,
   1311                       pg,
   1312                       addr,
   1313                       static_cast<SVELoad1Fn>(&Assembler::ld1b));
   1314 }
   1315 
   1316 void MacroAssembler::Ld1h(const ZRegister& zt,
   1317                           const PRegisterZ& pg,
   1318                           const SVEMemOperand& addr) {
   1319   VIXL_ASSERT(allow_macro_instructions_);
   1320   SVELoadStore1Helper(kHRegSizeInBytesLog2,
   1321                       zt,
   1322                       pg,
   1323                       addr,
   1324                       static_cast<SVELoad1Fn>(&Assembler::ld1h));
   1325 }
   1326 
   1327 void MacroAssembler::Ld1w(const ZRegister& zt,
   1328                           const PRegisterZ& pg,
   1329                           const SVEMemOperand& addr) {
   1330   VIXL_ASSERT(allow_macro_instructions_);
   1331   SVELoadStore1Helper(kWRegSizeInBytesLog2,
   1332                       zt,
   1333                       pg,
   1334                       addr,
   1335                       static_cast<SVELoad1Fn>(&Assembler::ld1w));
   1336 }
   1337 
   1338 void MacroAssembler::Ld1d(const ZRegister& zt,
   1339                           const PRegisterZ& pg,
   1340                           const SVEMemOperand& addr) {
   1341   VIXL_ASSERT(allow_macro_instructions_);
   1342   SVELoadStore1Helper(kDRegSizeInBytesLog2,
   1343                       zt,
   1344                       pg,
   1345                       addr,
   1346                       static_cast<SVELoad1Fn>(&Assembler::ld1d));
   1347 }
   1348 
   1349 void MacroAssembler::Ld1sb(const ZRegister& zt,
   1350                            const PRegisterZ& pg,
   1351                            const SVEMemOperand& addr) {
   1352   VIXL_ASSERT(allow_macro_instructions_);
   1353   SVELoadStore1Helper(kBRegSizeInBytesLog2,
   1354                       zt,
   1355                       pg,
   1356                       addr,
   1357                       static_cast<SVELoad1Fn>(&Assembler::ld1sb));
   1358 }
   1359 
   1360 void MacroAssembler::Ld1sh(const ZRegister& zt,
   1361                            const PRegisterZ& pg,
   1362                            const SVEMemOperand& addr) {
   1363   VIXL_ASSERT(allow_macro_instructions_);
   1364   SVELoadStore1Helper(kHRegSizeInBytesLog2,
   1365                       zt,
   1366                       pg,
   1367                       addr,
   1368                       static_cast<SVELoad1Fn>(&Assembler::ld1sh));
   1369 }
   1370 
   1371 void MacroAssembler::Ld1sw(const ZRegister& zt,
   1372                            const PRegisterZ& pg,
   1373                            const SVEMemOperand& addr) {
   1374   VIXL_ASSERT(allow_macro_instructions_);
   1375   SVELoadStore1Helper(kSRegSizeInBytesLog2,
   1376                       zt,
   1377                       pg,
   1378                       addr,
   1379                       static_cast<SVELoad1Fn>(&Assembler::ld1sw));
   1380 }
   1381 
   1382 void MacroAssembler::St1b(const ZRegister& zt,
   1383                           const PRegister& pg,
   1384                           const SVEMemOperand& addr) {
   1385   VIXL_ASSERT(allow_macro_instructions_);
   1386   SVELoadStore1Helper(kBRegSizeInBytesLog2,
   1387                       zt,
   1388                       pg,
   1389                       addr,
   1390                       static_cast<SVEStore1Fn>(&Assembler::st1b));
   1391 }
   1392 
   1393 void MacroAssembler::St1h(const ZRegister& zt,
   1394                           const PRegister& pg,
   1395                           const SVEMemOperand& addr) {
   1396   VIXL_ASSERT(allow_macro_instructions_);
   1397   SVELoadStore1Helper(kHRegSizeInBytesLog2,
   1398                       zt,
   1399                       pg,
   1400                       addr,
   1401                       static_cast<SVEStore1Fn>(&Assembler::st1h));
   1402 }
   1403 
   1404 void MacroAssembler::St1w(const ZRegister& zt,
   1405                           const PRegister& pg,
   1406                           const SVEMemOperand& addr) {
   1407   VIXL_ASSERT(allow_macro_instructions_);
   1408   SVELoadStore1Helper(kSRegSizeInBytesLog2,
   1409                       zt,
   1410                       pg,
   1411                       addr,
   1412                       static_cast<SVEStore1Fn>(&Assembler::st1w));
   1413 }
   1414 
   1415 void MacroAssembler::St1d(const ZRegister& zt,
   1416                           const PRegister& pg,
   1417                           const SVEMemOperand& addr) {
   1418   VIXL_ASSERT(allow_macro_instructions_);
   1419   SVELoadStore1Helper(kDRegSizeInBytesLog2,
   1420                       zt,
   1421                       pg,
   1422                       addr,
   1423                       static_cast<SVEStore1Fn>(&Assembler::st1d));
   1424 }
   1425 
   1426 void MacroAssembler::Ldff1b(const ZRegister& zt,
   1427                             const PRegisterZ& pg,
   1428                             const SVEMemOperand& addr) {
   1429   VIXL_ASSERT(allow_macro_instructions_);
   1430   SVELoadFFHelper(kBRegSizeInBytesLog2,
   1431                   zt,
   1432                   pg,
   1433                   addr,
   1434                   static_cast<SVELoad1Fn>(&Assembler::ldff1b));
   1435 }
   1436 
   1437 void MacroAssembler::Ldff1h(const ZRegister& zt,
   1438                             const PRegisterZ& pg,
   1439                             const SVEMemOperand& addr) {
   1440   VIXL_ASSERT(allow_macro_instructions_);
   1441   SVELoadFFHelper(kHRegSizeInBytesLog2,
   1442                   zt,
   1443                   pg,
   1444                   addr,
   1445                   static_cast<SVELoad1Fn>(&Assembler::ldff1h));
   1446 }
   1447 
   1448 void MacroAssembler::Ldff1w(const ZRegister& zt,
   1449                             const PRegisterZ& pg,
   1450                             const SVEMemOperand& addr) {
   1451   VIXL_ASSERT(allow_macro_instructions_);
   1452   SVELoadFFHelper(kSRegSizeInBytesLog2,
   1453                   zt,
   1454                   pg,
   1455                   addr,
   1456                   static_cast<SVELoad1Fn>(&Assembler::ldff1w));
   1457 }
   1458 
   1459 void MacroAssembler::Ldff1d(const ZRegister& zt,
   1460                             const PRegisterZ& pg,
   1461                             const SVEMemOperand& addr) {
   1462   VIXL_ASSERT(allow_macro_instructions_);
   1463   SVELoadFFHelper(kDRegSizeInBytesLog2,
   1464                   zt,
   1465                   pg,
   1466                   addr,
   1467                   static_cast<SVELoad1Fn>(&Assembler::ldff1d));
   1468 }
   1469 
   1470 void MacroAssembler::Ldff1sb(const ZRegister& zt,
   1471                              const PRegisterZ& pg,
   1472                              const SVEMemOperand& addr) {
   1473   VIXL_ASSERT(allow_macro_instructions_);
   1474   SVELoadFFHelper(kBRegSizeInBytesLog2,
   1475                   zt,
   1476                   pg,
   1477                   addr,
   1478                   static_cast<SVELoad1Fn>(&Assembler::ldff1sb));
   1479 }
   1480 
   1481 void MacroAssembler::Ldff1sh(const ZRegister& zt,
   1482                              const PRegisterZ& pg,
   1483                              const SVEMemOperand& addr) {
   1484   VIXL_ASSERT(allow_macro_instructions_);
   1485   SVELoadFFHelper(kHRegSizeInBytesLog2,
   1486                   zt,
   1487                   pg,
   1488                   addr,
   1489                   static_cast<SVELoad1Fn>(&Assembler::ldff1sh));
   1490 }
   1491 
   1492 void MacroAssembler::Ldff1sw(const ZRegister& zt,
   1493                              const PRegisterZ& pg,
   1494                              const SVEMemOperand& addr) {
   1495   VIXL_ASSERT(allow_macro_instructions_);
   1496   SVELoadFFHelper(kSRegSizeInBytesLog2,
   1497                   zt,
   1498                   pg,
   1499                   addr,
   1500                   static_cast<SVELoad1Fn>(&Assembler::ldff1sw));
   1501 }
   1502 
   1503 #define VIXL_SVE_LD1R_LIST(V) \
   1504   V(qb, 4) V(qh, 4) V(qw, 4) V(qd, 4) V(ob, 5) V(oh, 5) V(ow, 5) V(od, 5)
   1505 
   1506 #define VIXL_DEFINE_MASM_FUNC(SZ, SH)                          \
   1507   void MacroAssembler::Ld1r##SZ(const ZRegister& zt,           \
   1508                                 const PRegisterZ& pg,          \
   1509                                 const SVEMemOperand& addr) {   \
   1510     VIXL_ASSERT(allow_macro_instructions_);                    \
   1511     SVELoadStoreNTBroadcastQOHelper(zt,                        \
   1512                                     pg,                        \
   1513                                     addr,                      \
   1514                                     &MacroAssembler::ld1r##SZ, \
   1515                                     4,                         \
   1516                                     SH,                        \
   1517                                     NO_SVE_OFFSET_MODIFIER,    \
   1518                                     -1);                       \
   1519   }
   1520 
   1521 VIXL_SVE_LD1R_LIST(VIXL_DEFINE_MASM_FUNC)
   1522 
   1523 #undef VIXL_DEFINE_MASM_FUNC
   1524 #undef VIXL_SVE_LD1R_LIST
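
        // The list above defines the 128-bit (Ld1rq*) and 256-bit (Ld1ro*) replicating
        // loads; the second macro argument is the shift applied to the signed 4-bit
        // immediate offset (multiples of 16 or 32 bytes). For instance, V(qb, 4)
        // expands to roughly:
        //   void MacroAssembler::Ld1rqb(const ZRegister& zt,
        //                               const PRegisterZ& pg,
        //                               const SVEMemOperand& addr) {
        //     VIXL_ASSERT(allow_macro_instructions_);
        //     SVELoadStoreNTBroadcastQOHelper(zt, pg, addr, &MacroAssembler::ld1rqb,
        //                                     4, 4, NO_SVE_OFFSET_MODIFIER, -1);
        //   }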
   1525 
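        // For the non-temporal loads and stores below, SVE2's vector-plus-scalar form
        // is emitted directly; every other addressing mode goes through the
        // broadcast/NT helper, which accepts an encodable signed 4-bit, VL-scaled
        // immediate (e.g. [x0, #-8, MUL VL]) and otherwise falls back to computing
        // the address in a scratch register.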
   1526 void MacroAssembler::Ldnt1b(const ZRegister& zt,
   1527                             const PRegisterZ& pg,
   1528                             const SVEMemOperand& addr) {
   1529   VIXL_ASSERT(allow_macro_instructions_);
   1530   if (addr.IsVectorPlusScalar()) {
   1531     SingleEmissionCheckScope guard(this);
   1532     ldnt1b(zt, pg, addr);
   1533   } else {
   1534     SVELoadStoreNTBroadcastQOHelper(zt,
   1535                                     pg,
   1536                                     addr,
   1537                                     &MacroAssembler::ldnt1b,
   1538                                     4,
   1539                                     0,
   1540                                     SVE_MUL_VL);
   1541   }
   1542 }
   1543 
   1544 void MacroAssembler::Ldnt1d(const ZRegister& zt,
   1545                             const PRegisterZ& pg,
   1546                             const SVEMemOperand& addr) {
   1547   VIXL_ASSERT(allow_macro_instructions_);
   1548   if (addr.IsVectorPlusScalar()) {
   1549     SingleEmissionCheckScope guard(this);
   1550     ldnt1d(zt, pg, addr);
   1551   } else {
   1552     SVELoadStoreNTBroadcastQOHelper(zt,
   1553                                     pg,
   1554                                     addr,
   1555                                     &MacroAssembler::ldnt1d,
   1556                                     4,
   1557                                     0,
   1558                                     SVE_MUL_VL);
   1559   }
   1560 }
   1561 
   1562 void MacroAssembler::Ldnt1h(const ZRegister& zt,
   1563                             const PRegisterZ& pg,
   1564                             const SVEMemOperand& addr) {
   1565   VIXL_ASSERT(allow_macro_instructions_);
   1566   if (addr.IsVectorPlusScalar()) {
   1567     SingleEmissionCheckScope guard(this);
   1568     ldnt1h(zt, pg, addr);
   1569   } else {
   1570     SVELoadStoreNTBroadcastQOHelper(zt,
   1571                                     pg,
   1572                                     addr,
   1573                                     &MacroAssembler::ldnt1h,
   1574                                     4,
   1575                                     0,
   1576                                     SVE_MUL_VL);
   1577   }
   1578 }
   1579 
   1580 void MacroAssembler::Ldnt1w(const ZRegister& zt,
   1581                             const PRegisterZ& pg,
   1582                             const SVEMemOperand& addr) {
   1583   VIXL_ASSERT(allow_macro_instructions_);
   1584   if (addr.IsVectorPlusScalar()) {
   1585     SingleEmissionCheckScope guard(this);
   1586     ldnt1w(zt, pg, addr);
   1587   } else {
   1588     SVELoadStoreNTBroadcastQOHelper(zt,
   1589                                     pg,
   1590                                     addr,
   1591                                     &MacroAssembler::ldnt1w,
   1592                                     4,
   1593                                     0,
   1594                                     SVE_MUL_VL);
   1595   }
   1596 }
   1597 
   1598 void MacroAssembler::Stnt1b(const ZRegister& zt,
   1599                             const PRegister& pg,
   1600                             const SVEMemOperand& addr) {
   1601   VIXL_ASSERT(allow_macro_instructions_);
   1602   if (addr.IsVectorPlusScalar()) {
   1603     SingleEmissionCheckScope guard(this);
   1604     stnt1b(zt, pg, addr);
   1605   } else {
   1606     SVELoadStoreNTBroadcastQOHelper(zt,
   1607                                     pg,
   1608                                     addr,
   1609                                     &MacroAssembler::stnt1b,
   1610                                     4,
   1611                                     0,
   1612                                     SVE_MUL_VL);
   1613   }
   1614 }
   1615 void MacroAssembler::Stnt1d(const ZRegister& zt,
   1616                             const PRegister& pg,
   1617                             const SVEMemOperand& addr) {
   1618   VIXL_ASSERT(allow_macro_instructions_);
   1619   if (addr.IsVectorPlusScalar()) {
   1620     SingleEmissionCheckScope guard(this);
   1621     stnt1d(zt, pg, addr);
   1622   } else {
   1623     SVELoadStoreNTBroadcastQOHelper(zt,
   1624                                     pg,
   1625                                     addr,
   1626                                     &MacroAssembler::stnt1d,
   1627                                     4,
   1628                                     0,
   1629                                     SVE_MUL_VL);
   1630   }
   1631 }
   1632 void MacroAssembler::Stnt1h(const ZRegister& zt,
   1633                             const PRegister& pg,
   1634                             const SVEMemOperand& addr) {
   1635   VIXL_ASSERT(allow_macro_instructions_);
   1636   if (addr.IsVectorPlusScalar()) {
   1637     SingleEmissionCheckScope guard(this);
   1638     stnt1h(zt, pg, addr);
   1639   } else {
   1640     SVELoadStoreNTBroadcastQOHelper(zt,
   1641                                     pg,
   1642                                     addr,
   1643                                     &MacroAssembler::stnt1h,
   1644                                     4,
   1645                                     0,
   1646                                     SVE_MUL_VL);
   1647   }
   1648 }
   1649 void MacroAssembler::Stnt1w(const ZRegister& zt,
   1650                             const PRegister& pg,
   1651                             const SVEMemOperand& addr) {
   1652   VIXL_ASSERT(allow_macro_instructions_);
   1653   if (addr.IsVectorPlusScalar()) {
   1654     SingleEmissionCheckScope guard(this);
   1655     stnt1w(zt, pg, addr);
   1656   } else {
   1657     SVELoadStoreNTBroadcastQOHelper(zt,
   1658                                     pg,
   1659                                     addr,
   1660                                     &MacroAssembler::stnt1w,
   1661                                     4,
   1662                                     0,
   1663                                     SVE_MUL_VL);
   1664   }
   1665 }
   1666 
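        // Helper for the indexed dot-product macros (Sdot, Udot, Sudot and Usdot
        // below). The indexed instructions are destructive in their first operand, so
        // a scratch register is needed whenever zd aliases one of the multiplicands
        // but not the accumulator.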
   1667 void MacroAssembler::SVEDotIndexHelper(ZZZImmFn fn,
   1668                                        const ZRegister& zd,
   1669                                        const ZRegister& za,
   1670                                        const ZRegister& zn,
   1671                                        const ZRegister& zm,
   1672                                        int index) {
   1673   if (zd.Aliases(za)) {
   1674     // zda = zda + (zn . zm[index])
   1675     SingleEmissionCheckScope guard(this);
   1676     (this->*fn)(zd, zn, zm, index);
   1677 
   1678   } else if (zd.Aliases(zn) || zd.Aliases(zm)) {
   1679     // zdn = za + (zdn . zm[index])
   1680     // zdm = za + (zn . zdm[index])
   1681     // zdnm = za + (zdnm . zdnm[index])
   1682     UseScratchRegisterScope temps(this);
   1683     ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
   1684     {
   1685       MovprfxHelperScope guard(this, scratch, za);
   1686       (this->*fn)(scratch, zn, zm, index);
   1687     }
   1688 
   1689     Mov(zd, scratch);
   1690   } else {
   1691     // zd = za + (zn . zm[index])
   1692     MovprfxHelperScope guard(this, zd, za);
   1693     (this->*fn)(zd, zn, zm, index);
   1694   }
   1695 }
   1696 
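        // The two FourRegDestructiveHelper overloads below implement the same aliasing
        // strategy; they differ only in whether the underlying assembler function
        // takes three operands (e.g. sdot) or repeats the destination as an explicit
        // first source (e.g. bcax), which is selected by overload resolution on the
        // type of `fn`.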
   1697 void MacroAssembler::FourRegDestructiveHelper(Int3ArithFn fn,
   1698                                               const ZRegister& zd,
   1699                                               const ZRegister& za,
   1700                                               const ZRegister& zn,
   1701                                               const ZRegister& zm) {
   1702   if (!zd.Aliases(za) && (zd.Aliases(zn) || zd.Aliases(zm))) {
   1703     // zd = za . zd . zm
   1704     // zd = za . zn . zd
   1705     // zd = za . zd . zd
   1706     UseScratchRegisterScope temps(this);
   1707     ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
   1708     {
   1709       MovprfxHelperScope guard(this, scratch, za);
   1710       (this->*fn)(scratch, zn, zm);
   1711     }
   1712 
   1713     Mov(zd, scratch);
   1714   } else {
   1715     MovprfxHelperScope guard(this, zd, za);
   1716     (this->*fn)(zd, zn, zm);
   1717   }
   1718 }
   1719 
   1720 void MacroAssembler::FourRegDestructiveHelper(Int4ArithFn fn,
   1721                                               const ZRegister& zd,
   1722                                               const ZRegister& za,
   1723                                               const ZRegister& zn,
   1724                                               const ZRegister& zm) {
   1725   if (!zd.Aliases(za) && (zd.Aliases(zn) || zd.Aliases(zm))) {
   1726     // zd = za . zd . zm
   1727     // zd = za . zn . zd
   1728     // zd = za . zd . zd
   1729     UseScratchRegisterScope temps(this);
   1730     ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
   1731     {
   1732       MovprfxHelperScope guard(this, scratch, za);
   1733       (this->*fn)(scratch, scratch, zn, zm);
   1734     }
   1735 
   1736     Mov(zd, scratch);
   1737   } else {
   1738     MovprfxHelperScope guard(this, zd, za);
   1739     (this->*fn)(zd, zd, zn, zm);
   1740   }
   1741 }
   1742 
   1743 void MacroAssembler::FourRegOneImmDestructiveHelper(ZZZImmFn fn,
   1744                                                     const ZRegister& zd,
   1745                                                     const ZRegister& za,
   1746                                                     const ZRegister& zn,
   1747                                                     const ZRegister& zm,
   1748                                                     int imm) {
   1749   if (!zd.Aliases(za) && (zd.Aliases(zn) || zd.Aliases(zm))) {
   1750     // zd = za . zd . zm[i]
   1751     // zd = za . zn . zd[i]
   1752     // zd = za . zd . zd[i]
   1753     UseScratchRegisterScope temps(this);
   1754     ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
   1755     {
   1756       MovprfxHelperScope guard(this, scratch, za);
   1757       (this->*fn)(scratch, zn, zm, imm);
   1758     }
   1759 
   1760     Mov(zd, scratch);
   1761   } else {
   1762     // zd = za . zn . zm[i]
   1763     MovprfxHelperScope guard(this, zd, za);
   1764     (this->*fn)(zd, zn, zm, imm);
   1765   }
   1766 }
   1767 
   1768 void MacroAssembler::AbsoluteDifferenceAccumulate(Int3ArithFn fn,
   1769                                                   const ZRegister& zd,
   1770                                                   const ZRegister& za,
   1771                                                   const ZRegister& zn,
   1772                                                   const ZRegister& zm) {
   1773   if (zn.Aliases(zm)) {
   1774     // If zn == zm, the difference is zero.
   1775     if (!zd.Aliases(za)) {
   1776       Mov(zd, za);
   1777     }
   1778   } else if (zd.Aliases(za)) {
   1779     SingleEmissionCheckScope guard(this);
   1780     (this->*fn)(zd, zn, zm);
   1781   } else if (zd.Aliases(zn)) {
   1782     UseScratchRegisterScope temps(this);
   1783     ZRegister ztmp = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
   1784     Mov(ztmp, zn);
   1785     MovprfxHelperScope guard(this, zd, za);
   1786     (this->*fn)(zd, ztmp, zm);
   1787   } else if (zd.Aliases(zm)) {
   1788     UseScratchRegisterScope temps(this);
   1789     ZRegister ztmp = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
   1790     Mov(ztmp, zm);
   1791     MovprfxHelperScope guard(this, zd, za);
   1792     (this->*fn)(zd, zn, ztmp);
   1793   } else {
   1794     MovprfxHelperScope guard(this, zd, za);
   1795     (this->*fn)(zd, zn, zm);
   1796   }
   1797 }
   1798 
   1799 #define VIXL_SVE_4REG_LIST(V)                       \
   1800   V(Saba, saba, AbsoluteDifferenceAccumulate)       \
   1801   V(Uaba, uaba, AbsoluteDifferenceAccumulate)       \
   1802   V(Sabalb, sabalb, AbsoluteDifferenceAccumulate)   \
   1803   V(Sabalt, sabalt, AbsoluteDifferenceAccumulate)   \
   1804   V(Uabalb, uabalb, AbsoluteDifferenceAccumulate)   \
   1805   V(Uabalt, uabalt, AbsoluteDifferenceAccumulate)   \
   1806   V(Sdot, sdot, FourRegDestructiveHelper)           \
   1807   V(Udot, udot, FourRegDestructiveHelper)           \
   1808   V(Adclb, adclb, FourRegDestructiveHelper)         \
   1809   V(Adclt, adclt, FourRegDestructiveHelper)         \
   1810   V(Sbclb, sbclb, FourRegDestructiveHelper)         \
   1811   V(Sbclt, sbclt, FourRegDestructiveHelper)         \
   1812   V(Smlalb, smlalb, FourRegDestructiveHelper)       \
   1813   V(Smlalt, smlalt, FourRegDestructiveHelper)       \
   1814   V(Smlslb, smlslb, FourRegDestructiveHelper)       \
   1815   V(Smlslt, smlslt, FourRegDestructiveHelper)       \
   1816   V(Umlalb, umlalb, FourRegDestructiveHelper)       \
   1817   V(Umlalt, umlalt, FourRegDestructiveHelper)       \
   1818   V(Umlslb, umlslb, FourRegDestructiveHelper)       \
   1819   V(Umlslt, umlslt, FourRegDestructiveHelper)       \
   1820   V(Bcax, bcax, FourRegDestructiveHelper)           \
   1821   V(Bsl, bsl, FourRegDestructiveHelper)             \
   1822   V(Bsl1n, bsl1n, FourRegDestructiveHelper)         \
   1823   V(Bsl2n, bsl2n, FourRegDestructiveHelper)         \
   1824   V(Eor3, eor3, FourRegDestructiveHelper)           \
   1825   V(Nbsl, nbsl, FourRegDestructiveHelper)           \
   1826   V(Fmlalb, fmlalb, FourRegDestructiveHelper)       \
   1827   V(Fmlalt, fmlalt, FourRegDestructiveHelper)       \
   1828   V(Fmlslb, fmlslb, FourRegDestructiveHelper)       \
   1829   V(Fmlslt, fmlslt, FourRegDestructiveHelper)       \
   1830   V(Sqdmlalb, sqdmlalb, FourRegDestructiveHelper)   \
   1831   V(Sqdmlalbt, sqdmlalbt, FourRegDestructiveHelper) \
   1832   V(Sqdmlalt, sqdmlalt, FourRegDestructiveHelper)   \
   1833   V(Sqdmlslb, sqdmlslb, FourRegDestructiveHelper)   \
   1834   V(Sqdmlslbt, sqdmlslbt, FourRegDestructiveHelper) \
   1835   V(Sqdmlslt, sqdmlslt, FourRegDestructiveHelper)   \
   1836   V(Sqrdmlah, sqrdmlah, FourRegDestructiveHelper)   \
   1837   V(Sqrdmlsh, sqrdmlsh, FourRegDestructiveHelper)   \
   1838   V(Fmmla, fmmla, FourRegDestructiveHelper)         \
   1839   V(Smmla, smmla, FourRegDestructiveHelper)         \
   1840   V(Ummla, ummla, FourRegDestructiveHelper)         \
   1841   V(Usmmla, usmmla, FourRegDestructiveHelper)       \
   1842   V(Usdot, usdot, FourRegDestructiveHelper)
   1843 
   1844 #define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN, HELPER) \
   1845   void MacroAssembler::MASMFN(const ZRegister& zd,   \
   1846                               const ZRegister& za,   \
   1847                               const ZRegister& zn,   \
   1848                               const ZRegister& zm) { \
   1849     VIXL_ASSERT(allow_macro_instructions_);          \
   1850     HELPER(&Assembler::ASMFN, zd, za, zn, zm);       \
   1851   }
   1852 VIXL_SVE_4REG_LIST(VIXL_DEFINE_MASM_FUNC)
   1853 #undef VIXL_DEFINE_MASM_FUNC
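
        // Each entry in the list above forwards to one of the aliasing helpers; for
        // example, V(Saba, saba, AbsoluteDifferenceAccumulate) expands to roughly:
        //   void MacroAssembler::Saba(const ZRegister& zd, const ZRegister& za,
        //                             const ZRegister& zn, const ZRegister& zm) {
        //     VIXL_ASSERT(allow_macro_instructions_);
        //     AbsoluteDifferenceAccumulate(&Assembler::saba, zd, za, zn, zm);
        //   }
        // The indexed (one-immediate) list below expands in the same way, with the
        // extra `int imm` argument passed through.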
   1854 
   1855 #define VIXL_SVE_4REG_1IMM_LIST(V)                      \
   1856   V(Fmla, fmla, FourRegOneImmDestructiveHelper)         \
   1857   V(Fmls, fmls, FourRegOneImmDestructiveHelper)         \
   1858   V(Fmlalb, fmlalb, FourRegOneImmDestructiveHelper)     \
   1859   V(Fmlalt, fmlalt, FourRegOneImmDestructiveHelper)     \
   1860   V(Fmlslb, fmlslb, FourRegOneImmDestructiveHelper)     \
   1861   V(Fmlslt, fmlslt, FourRegOneImmDestructiveHelper)     \
   1862   V(Mla, mla, FourRegOneImmDestructiveHelper)           \
   1863   V(Mls, mls, FourRegOneImmDestructiveHelper)           \
   1864   V(Smlalb, smlalb, FourRegOneImmDestructiveHelper)     \
   1865   V(Smlalt, smlalt, FourRegOneImmDestructiveHelper)     \
   1866   V(Smlslb, smlslb, FourRegOneImmDestructiveHelper)     \
   1867   V(Smlslt, smlslt, FourRegOneImmDestructiveHelper)     \
   1868   V(Sqdmlalb, sqdmlalb, FourRegOneImmDestructiveHelper) \
   1869   V(Sqdmlalt, sqdmlalt, FourRegOneImmDestructiveHelper) \
   1870   V(Sqdmlslb, sqdmlslb, FourRegOneImmDestructiveHelper) \
   1871   V(Sqdmlslt, sqdmlslt, FourRegOneImmDestructiveHelper) \
   1872   V(Sqrdmlah, sqrdmlah, FourRegOneImmDestructiveHelper) \
   1873   V(Sqrdmlsh, sqrdmlsh, FourRegOneImmDestructiveHelper) \
   1874   V(Umlalb, umlalb, FourRegOneImmDestructiveHelper)     \
   1875   V(Umlalt, umlalt, FourRegOneImmDestructiveHelper)     \
   1876   V(Umlslb, umlslb, FourRegOneImmDestructiveHelper)     \
   1877   V(Umlslt, umlslt, FourRegOneImmDestructiveHelper)
   1878 
   1879 #define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN, HELPER) \
   1880   void MacroAssembler::MASMFN(const ZRegister& zd,   \
   1881                               const ZRegister& za,   \
   1882                               const ZRegister& zn,   \
   1883                               const ZRegister& zm,   \
   1884                               int imm) {             \
   1885     VIXL_ASSERT(allow_macro_instructions_);          \
   1886     HELPER(&Assembler::ASMFN, zd, za, zn, zm, imm);  \
   1887   }
   1888 VIXL_SVE_4REG_1IMM_LIST(VIXL_DEFINE_MASM_FUNC)
   1889 #undef VIXL_DEFINE_MASM_FUNC
   1890 
   1891 void MacroAssembler::Sdot(const ZRegister& zd,
   1892                           const ZRegister& za,
   1893                           const ZRegister& zn,
   1894                           const ZRegister& zm,
   1895                           int index) {
   1896   VIXL_ASSERT(allow_macro_instructions_);
   1897   SVEDotIndexHelper(&Assembler::sdot, zd, za, zn, zm, index);
   1898 }
   1899 
   1900 void MacroAssembler::Udot(const ZRegister& zd,
   1901                           const ZRegister& za,
   1902                           const ZRegister& zn,
   1903                           const ZRegister& zm,
   1904                           int index) {
   1905   VIXL_ASSERT(allow_macro_instructions_);
   1906   SVEDotIndexHelper(&Assembler::udot, zd, za, zn, zm, index);
   1907 }
   1908 
   1909 void MacroAssembler::Sudot(const ZRegister& zd,
   1910                            const ZRegister& za,
   1911                            const ZRegister& zn,
   1912                            const ZRegister& zm,
   1913                            int index) {
   1914   VIXL_ASSERT(allow_macro_instructions_);
   1915   SVEDotIndexHelper(&Assembler::sudot, zd, za, zn, zm, index);
   1916 }
   1917 
   1918 void MacroAssembler::Usdot(const ZRegister& zd,
   1919                            const ZRegister& za,
   1920                            const ZRegister& zn,
   1921                            const ZRegister& zm,
   1922                            int index) {
   1923   VIXL_ASSERT(allow_macro_instructions_);
   1924   SVEDotIndexHelper(&Assembler::usdot, zd, za, zn, zm, index);
   1925 }
   1926 
   1927 void MacroAssembler::Cdot(const ZRegister& zd,
   1928                           const ZRegister& za,
   1929                           const ZRegister& zn,
   1930                           const ZRegister& zm,
   1931                           int index,
   1932                           int rot) {
   1933   // This doesn't handle the case where zm can't be encoded in the instruction.
   1934   // The encodable range depends on the element size: z0-z7 for B, z0-z15 for H.
   1935   if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) {
   1936     UseScratchRegisterScope temps(this);
   1937     ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zd);
   1938     {
   1939       MovprfxHelperScope guard(this, ztmp, za);
   1940       cdot(ztmp, zn, zm, index, rot);
   1941     }
   1942     Mov(zd, ztmp);
   1943   } else {
   1944     MovprfxHelperScope guard(this, zd, za);
   1945     cdot(zd, zn, zm, index, rot);
   1946   }
   1947 }
   1948 
   1949 void MacroAssembler::Cdot(const ZRegister& zd,
   1950                           const ZRegister& za,
   1951                           const ZRegister& zn,
   1952                           const ZRegister& zm,
   1953                           int rot) {
   1954   if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) {
   1955     UseScratchRegisterScope temps(this);
   1956     VIXL_ASSERT(AreSameLaneSize(zn, zm));
   1957     ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zn);
   1958     Mov(ztmp, zd.Aliases(zn) ? zn : zm);
   1959     MovprfxHelperScope guard(this, zd, za);
   1960     cdot(zd, (zd.Aliases(zn) ? ztmp : zn), (zd.Aliases(zm) ? ztmp : zm), rot);
   1961   } else {
   1962     MovprfxHelperScope guard(this, zd, za);
   1963     cdot(zd, zn, zm, rot);
   1964   }
   1965 }
   1966 
   1967 void MacroAssembler::FPMulAddHelper(const ZRegister& zd,
   1968                                     const PRegisterM& pg,
   1969                                     const ZRegister& za,
   1970                                     const ZRegister& zn,
   1971                                     const ZRegister& zm,
   1972                                     SVEMulAddPredicatedZdaFn fn_zda,
   1973                                     SVEMulAddPredicatedZdnFn fn_zdn,
   1974                                     FPMacroNaNPropagationOption nan_option) {
   1975   ResolveFPNaNPropagationOption(&nan_option);
   1976 
   1977   if (zd.Aliases(za)) {
   1978     // zda = (-)zda + ((-)zn * zm) for fmla, fmls, fnmla and fnmls.
   1979     SingleEmissionCheckScope guard(this);
   1980     (this->*fn_zda)(zd, pg, zn, zm);
   1981   } else if (zd.Aliases(zn)) {
   1982     // zdn = (-)za + ((-)zdn * zm) for fmad, fmsb, fnmad and fnmsb.
   1983     SingleEmissionCheckScope guard(this);
   1984     (this->*fn_zdn)(zd, pg, zm, za);
   1985   } else if (zd.Aliases(zm)) {
   1986     switch (nan_option) {
   1987       case FastNaNPropagation: {
   1988         // In fast NaN-propagation mode we treat multiplication as commutative,
   1989         // so we can swap zn and zm.
   1990         // zdm = (-)za + ((-)zdm * zn) for fmad, fmsb, fnmad and fnmsb.
   1991         SingleEmissionCheckScope guard(this);
   1992         (this->*fn_zdn)(zd, pg, zn, za);
   1993         return;
   1994       }
   1995       case StrictNaNPropagation: {
   1996         UseScratchRegisterScope temps(this);
   1997         // Use a scratch register to keep the argument order exactly as
   1998         // specified.
   1999         ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zn);
   2000         {
   2001           MovprfxHelperScope guard(this, scratch, pg, za);
   2002           // scratch = (-)za + ((-)zn * zm)
   2003           (this->*fn_zda)(scratch, pg, zn, zm);
   2004         }
   2005         Mov(zd, scratch);
   2006         return;
   2007       }
   2008       case NoFPMacroNaNPropagationSelected:
   2009         VIXL_UNREACHABLE();
   2010         return;
   2011     }
   2012   } else {
   2013     // zd = (-)za + ((-)zn * zm) for fmla, fmls, fnmla and fnmls.
   2014     MovprfxHelperScope guard(this, zd, pg, za);
   2015     (this->*fn_zda)(zd, pg, zn, zm);
   2016   }
   2017 }
   2018 
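        // The predicated FP multiply-accumulate macros pass both the accumulator
        // ("zda") and multiplicand ("zdn") encodings to FPMulAddHelper, which picks
        // whichever avoids an extra move. As an illustrative example (register
        // choices are arbitrary):
        //   Fmla(z0.VnD(), p0.Merging(), z1.VnD(), z0.VnD(), z2.VnD());
        // has zd aliasing zn, so it is emitted as a single
        // `fmad z0.d, p0/m, z2.d, z1.d`, whereas a fully distinct zd needs a movprfx
        // first.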
   2019 void MacroAssembler::Fmla(const ZRegister& zd,
   2020                           const PRegisterM& pg,
   2021                           const ZRegister& za,
   2022                           const ZRegister& zn,
   2023                           const ZRegister& zm,
   2024                           FPMacroNaNPropagationOption nan_option) {
   2025   VIXL_ASSERT(allow_macro_instructions_);
   2026   FPMulAddHelper(zd,
   2027                  pg,
   2028                  za,
   2029                  zn,
   2030                  zm,
   2031                  &Assembler::fmla,
   2032                  &Assembler::fmad,
   2033                  nan_option);
   2034 }
   2035 
   2036 void MacroAssembler::Fmls(const ZRegister& zd,
   2037                           const PRegisterM& pg,
   2038                           const ZRegister& za,
   2039                           const ZRegister& zn,
   2040                           const ZRegister& zm,
   2041                           FPMacroNaNPropagationOption nan_option) {
   2042   VIXL_ASSERT(allow_macro_instructions_);
   2043   FPMulAddHelper(zd,
   2044                  pg,
   2045                  za,
   2046                  zn,
   2047                  zm,
   2048                  &Assembler::fmls,
   2049                  &Assembler::fmsb,
   2050                  nan_option);
   2051 }
   2052 
   2053 void MacroAssembler::Fnmla(const ZRegister& zd,
   2054                            const PRegisterM& pg,
   2055                            const ZRegister& za,
   2056                            const ZRegister& zn,
   2057                            const ZRegister& zm,
   2058                            FPMacroNaNPropagationOption nan_option) {
   2059   VIXL_ASSERT(allow_macro_instructions_);
   2060   FPMulAddHelper(zd,
   2061                  pg,
   2062                  za,
   2063                  zn,
   2064                  zm,
   2065                  &Assembler::fnmla,
   2066                  &Assembler::fnmad,
   2067                  nan_option);
   2068 }
   2069 
   2070 void MacroAssembler::Fnmls(const ZRegister& zd,
   2071                            const PRegisterM& pg,
   2072                            const ZRegister& za,
   2073                            const ZRegister& zn,
   2074                            const ZRegister& zm,
   2075                            FPMacroNaNPropagationOption nan_option) {
   2076   VIXL_ASSERT(allow_macro_instructions_);
   2077   FPMulAddHelper(zd,
   2078                  pg,
   2079                  za,
   2080                  zn,
   2081                  zm,
   2082                  &Assembler::fnmls,
   2083                  &Assembler::fnmsb,
   2084                  nan_option);
   2085 }
   2086 
   2087 void MacroAssembler::Ftmad(const ZRegister& zd,
   2088                            const ZRegister& zn,
   2089                            const ZRegister& zm,
   2090                            int imm3) {
   2091   VIXL_ASSERT(allow_macro_instructions_);
   2092   if (zd.Aliases(zm) && !zd.Aliases(zn)) {
   2093     UseScratchRegisterScope temps(this);
   2094     ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zm);
   2095     Mov(scratch, zm);
   2096     MovprfxHelperScope guard(this, zd, zn);
   2097     ftmad(zd, zd, scratch, imm3);
   2098   } else {
   2099     MovprfxHelperScope guard(this, zd, zn);
   2100     ftmad(zd, zd, zm, imm3);
   2101   }
   2102 }
   2103 
   2104 void MacroAssembler::Fcadd(const ZRegister& zd,
   2105                            const PRegisterM& pg,
   2106                            const ZRegister& zn,
   2107                            const ZRegister& zm,
   2108                            int rot) {
   2109   VIXL_ASSERT(allow_macro_instructions_);
   2110   if (zd.Aliases(zm) && !zd.Aliases(zn)) {
   2111     UseScratchRegisterScope temps(this);
   2112     ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
   2113     {
   2114       MovprfxHelperScope guard(this, scratch, pg, zn);
   2115       fcadd(scratch, pg, scratch, zm, rot);
   2116     }
   2117     Mov(zd, scratch);
   2118   } else {
   2119     MovprfxHelperScope guard(this, zd, pg, zn);
   2120     fcadd(zd, pg, zd, zm, rot);
   2121   }
   2122 }
   2123 
   2124 void MacroAssembler::Fcmla(const ZRegister& zd,
   2125                            const PRegisterM& pg,
   2126                            const ZRegister& za,
   2127                            const ZRegister& zn,
   2128                            const ZRegister& zm,
   2129                            int rot) {
   2130   VIXL_ASSERT(allow_macro_instructions_);
   2131   if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) {
   2132     UseScratchRegisterScope temps(this);
   2133     ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zd);
   2134     {
   2135       MovprfxHelperScope guard(this, ztmp, za);
   2136       fcmla(ztmp, pg, zn, zm, rot);
   2137     }
   2138     Mov(zd, pg, ztmp);
   2139   } else {
   2140     MovprfxHelperScope guard(this, zd, pg, za);
   2141     fcmla(zd, pg, zn, zm, rot);
   2142   }
   2143 }
   2144 
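        // With SVE2, a constructive SPLICE of two consecutive source registers can be
        // emitted directly; otherwise the destructive form is used, which requires zd
        // to start out as a copy of zn (hence the movprfx, and a scratch register when
        // zd aliases zm).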
   2145 void MacroAssembler::Splice(const ZRegister& zd,
   2146                             const PRegister& pg,
   2147                             const ZRegister& zn,
   2148                             const ZRegister& zm) {
   2149   VIXL_ASSERT(allow_macro_instructions_);
   2150   if (CPUHas(CPUFeatures::kSVE2) && AreConsecutive(zn, zm) && !zd.Aliases(zn)) {
   2151     SingleEmissionCheckScope guard(this);
   2152     splice(zd, pg, zn, zm);
   2153   } else if (zd.Aliases(zm) && !zd.Aliases(zn)) {
   2154     UseScratchRegisterScope temps(this);
   2155     ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
   2156     {
   2157       MovprfxHelperScope guard(this, scratch, zn);
   2158       splice(scratch, pg, scratch, zm);
   2159     }
   2160     Mov(zd, scratch);
   2161   } else {
   2162     MovprfxHelperScope guard(this, zd, zn);
   2163     splice(zd, pg, zd, zm);
   2164   }
   2165 }
   2166 
   2167 void MacroAssembler::Clasta(const ZRegister& zd,
   2168                             const PRegister& pg,
   2169                             const ZRegister& zn,
   2170                             const ZRegister& zm) {
   2171   VIXL_ASSERT(allow_macro_instructions_);
   2172   if (zd.Aliases(zm) && !zd.Aliases(zn)) {
   2173     UseScratchRegisterScope temps(this);
   2174     ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
   2175     {
   2176       MovprfxHelperScope guard(this, scratch, zn);
   2177       clasta(scratch, pg, scratch, zm);
   2178     }
   2179     Mov(zd, scratch);
   2180   } else {
   2181     MovprfxHelperScope guard(this, zd, zn);
   2182     clasta(zd, pg, zd, zm);
   2183   }
   2184 }
   2185 
   2186 void MacroAssembler::Clastb(const ZRegister& zd,
   2187                             const PRegister& pg,
   2188                             const ZRegister& zn,
   2189                             const ZRegister& zm) {
   2190   VIXL_ASSERT(allow_macro_instructions_);
   2191   if (zd.Aliases(zm) && !zd.Aliases(zn)) {
   2192     UseScratchRegisterScope temps(this);
   2193     ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
   2194     {
   2195       MovprfxHelperScope guard(this, scratch, zn);
   2196       clastb(scratch, pg, scratch, zm);
   2197     }
   2198     Mov(zd, scratch);
   2199   } else {
   2200     MovprfxHelperScope guard(this, zd, zn);
   2201     clastb(zd, pg, zd, zm);
   2202   }
   2203 }
   2204 
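        // Srsra, Ssra, Ursra and Usra accumulate a right-shifted zn into za. The
        // underlying instructions are destructive (the destination is also the
        // accumulator), so when zd aliases zn but not za, zn is copied to a scratch
        // register before zd is movprfx'd from za.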
   2205 void MacroAssembler::ShiftRightAccumulate(IntArithImmFn fn,
   2206                                           const ZRegister& zd,
   2207                                           const ZRegister& za,
   2208                                           const ZRegister& zn,
   2209                                           int shift) {
   2210   VIXL_ASSERT(allow_macro_instructions_);
   2211   if (!zd.Aliases(za) && zd.Aliases(zn)) {
   2212     UseScratchRegisterScope temps(this);
   2213     ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zn);
   2214     Mov(ztmp, zn);
   2215     {
   2216       MovprfxHelperScope guard(this, zd, za);
   2217       (this->*fn)(zd, ztmp, shift);
   2218     }
   2219   } else {
   2220     MovprfxHelperScope guard(this, zd, za);
   2221     (this->*fn)(zd, zn, shift);
   2222   }
   2223 }
   2224 
   2225 void MacroAssembler::Srsra(const ZRegister& zd,
   2226                            const ZRegister& za,
   2227                            const ZRegister& zn,
   2228                            int shift) {
   2229   ShiftRightAccumulate(&Assembler::srsra, zd, za, zn, shift);
   2230 }
   2231 
   2232 void MacroAssembler::Ssra(const ZRegister& zd,
   2233                           const ZRegister& za,
   2234                           const ZRegister& zn,
   2235                           int shift) {
   2236   ShiftRightAccumulate(&Assembler::ssra, zd, za, zn, shift);
   2237 }
   2238 
   2239 void MacroAssembler::Ursra(const ZRegister& zd,
   2240                            const ZRegister& za,
   2241                            const ZRegister& zn,
   2242                            int shift) {
   2243   ShiftRightAccumulate(&Assembler::ursra, zd, za, zn, shift);
   2244 }
   2245 
   2246 void MacroAssembler::Usra(const ZRegister& zd,
   2247                           const ZRegister& za,
   2248                           const ZRegister& zn,
   2249                           int shift) {
   2250   ShiftRightAccumulate(&Assembler::usra, zd, za, zn, shift);
   2251 }
   2252 
   2253 void MacroAssembler::ComplexAddition(ZZZImmFn fn,
   2254                                      const ZRegister& zd,
   2255                                      const ZRegister& zn,
   2256                                      const ZRegister& zm,
   2257                                      int rot) {
   2258   VIXL_ASSERT(allow_macro_instructions_);
   2259   if (!zd.Aliases(zn) && zd.Aliases(zm)) {
   2260     UseScratchRegisterScope temps(this);
   2261     ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zm);
   2262     Mov(ztmp, zm);
   2263     {
   2264       MovprfxHelperScope guard(this, zd, zn);
   2265       (this->*fn)(zd, zd, ztmp, rot);
   2266     }
   2267   } else {
   2268     MovprfxHelperScope guard(this, zd, zn);
   2269     (this->*fn)(zd, zd, zm, rot);
   2270   }
   2271 }
   2272 
   2273 void MacroAssembler::Cadd(const ZRegister& zd,
   2274                           const ZRegister& zn,
   2275                           const ZRegister& zm,
   2276                           int rot) {
   2277   ComplexAddition(&Assembler::cadd, zd, zn, zm, rot);
   2278 }
   2279 
   2280 void MacroAssembler::Sqcadd(const ZRegister& zd,
   2281                             const ZRegister& zn,
   2282                             const ZRegister& zm,
   2283                             int rot) {
   2284   ComplexAddition(&Assembler::sqcadd, zd, zn, zm, rot);
   2285 }
   2286 
   2287 }  // namespace aarch64
   2288 }  // namespace vixl