duckstation

duckstation, archived from the revision just before upstream relicensed the project as proprietary software; this version remains libre
git clone https://git.neptards.moe/u3shit/duckstation.git

cpu_newrec_compiler_aarch64.cpp (70496B)


      1 // SPDX-FileCopyrightText: 2024 Connor McLaughlin <stenzek@gmail.com>
      2 // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
      3 
      4 #include "cpu_newrec_compiler_aarch64.h"
      5 #include "common/align.h"
      6 #include "common/assert.h"
      7 #include "common/log.h"
      8 #include "common/string_util.h"
      9 #include "cpu_core_private.h"
     10 #include "cpu_pgxp.h"
     11 #include "cpu_recompiler_thunks.h"
     12 #include "cpu_recompiler_types.h"
     13 #include "gte.h"
     14 #include "settings.h"
     15 #include "timing_event.h"
     16 #include <limits>
     17 
     18 #ifdef CPU_ARCH_ARM64
     19 
     20 Log_SetChannel(CPU::NewRec);
     21 
     22 #define PTR(x) vixl::aarch64::MemOperand(RSTATE, (((u8*)(x)) - ((u8*)&g_state)))
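// PTR(x) turns an absolute pointer into a [RSTATE + offset] operand: RSTATE holds &g_state
// while generated code runs, so any State field is reachable with a single ldr/str.
// Illustrative use (the offset is shown symbolically, not taken from the original):
//   armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
//   // => ldr RWARG1, [RSTATE, #offsetof(State, pending_ticks)]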
     23 
     24 namespace CPU::NewRec {
     25 
     26 using namespace vixl::aarch64;
     27 
     28 using CPU::Recompiler::armEmitCall;
     29 using CPU::Recompiler::armEmitCondBranch;
     30 using CPU::Recompiler::armEmitFarLoad;
     31 using CPU::Recompiler::armEmitJmp;
     32 using CPU::Recompiler::armEmitMov;
     33 using CPU::Recompiler::armGetJumpTrampoline;
     34 using CPU::Recompiler::armGetPCDisplacement;
     35 using CPU::Recompiler::armIsCallerSavedRegister;
     36 using CPU::Recompiler::armMoveAddressToReg;
     37 
     38 AArch64Compiler s_instance;
     39 Compiler* g_compiler = &s_instance;
     40 
     41 } // namespace CPU::NewRec
     42 
     43 CPU::NewRec::AArch64Compiler::AArch64Compiler()
     44   : m_emitter(PositionDependentCode), m_far_emitter(PositionIndependentCode)
     45 {
     46 }
     47 
     48 CPU::NewRec::AArch64Compiler::~AArch64Compiler() = default;
     49 
     50 const void* CPU::NewRec::AArch64Compiler::GetCurrentCodePointer()
     51 {
     52   return armAsm->GetCursorAddress<const void*>();
     53 }
     54 
     55 void CPU::NewRec::AArch64Compiler::Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space,
     56                                          u8* far_code_buffer, u32 far_code_space)
     57 {
     58   Compiler::Reset(block, code_buffer, code_buffer_space, far_code_buffer, far_code_space);
     59 
     60   // TODO: don't recreate this every time..
     61   DebugAssert(!armAsm);
     62   m_emitter.GetBuffer()->Reset(code_buffer, code_buffer_space);
     63   m_far_emitter.GetBuffer()->Reset(far_code_buffer, far_code_space);
     64   armAsm = &m_emitter;
     65 
     66 #ifdef VIXL_DEBUG
     67   m_emitter_check = std::make_unique<vixl::CodeBufferCheckScope>(&m_emitter, code_buffer_space,
     68                                                                  vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
     69   m_far_emitter_check = std::make_unique<vixl::CodeBufferCheckScope>(
     70     &m_far_emitter, far_code_space, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
     71 #endif
     72 
     73   // Need to wipe it out so it's correct when toggling fastmem.
     74   m_host_regs = {};
     75 
     76   const u32 membase_idx = CodeCache::IsUsingFastmem() ? RMEMBASE.GetCode() : NUM_HOST_REGS;
     77   for (u32 i = 0; i < NUM_HOST_REGS; i++)
     78   {
     79     HostRegAlloc& ra = m_host_regs[i];
     80 
      81     if (i == RWARG1.GetCode() || i == RWARG2.GetCode() || i == RWARG3.GetCode() ||
     82         i == RWSCRATCH.GetCode() || i == RSTATE.GetCode() || i == membase_idx || i == x18.GetCode() || i >= 30)
     83     {
     84       continue;
     85     }
     86 
     87     ra.flags = HR_USABLE | (armIsCallerSavedRegister(i) ? 0 : HR_CALLEE_SAVED);
     88   }
     89 }
     90 
     91 void CPU::NewRec::AArch64Compiler::SwitchToFarCode(bool emit_jump, vixl::aarch64::Condition cond)
     92 {
     93   DebugAssert(armAsm == &m_emitter);
     94   if (emit_jump)
     95   {
     96     const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());
     97     if (cond != Condition::al)
     98     {
     99       if (vixl::IsInt19(disp))
    100       {
    101         armAsm->b(disp, cond);
    102       }
    103       else
    104       {
    105         Label skip;
    106         armAsm->b(&skip, vixl::aarch64::InvertCondition(cond));
    107         armAsm->b(armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>()));
    108         armAsm->bind(&skip);
    109       }
    110     }
    111     else
    112     {
    113       armAsm->b(disp);
    114     }
    115   }
    116   armAsm = &m_far_emitter;
    117 }
    118 
    119 void CPU::NewRec::AArch64Compiler::SwitchToFarCodeIfBitSet(const vixl::aarch64::Register& reg, u32 bit)
    120 {
    121   const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());
    122   if (vixl::IsInt14(disp))
    123   {
    124     armAsm->tbnz(reg, bit, disp);
    125   }
    126   else
    127   {
    128     Label skip;
    129     armAsm->tbz(reg, bit, &skip);
    130     armAsm->b(armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>()));
    131     armAsm->bind(&skip);
    132   }
    133 
    134   armAsm = &m_far_emitter;
    135 }
    136 
    137 void CPU::NewRec::AArch64Compiler::SwitchToFarCodeIfRegZeroOrNonZero(const vixl::aarch64::Register& reg, bool nonzero)
    138 {
    139   const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());
    140   if (vixl::IsInt19(disp))
    141   {
    142     nonzero ? armAsm->cbnz(reg, disp) : armAsm->cbz(reg, disp);
    143   }
    144   else
    145   {
    146     Label skip;
    147     nonzero ? armAsm->cbz(reg, &skip) : armAsm->cbnz(reg, &skip);
    148     armAsm->b(armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>()));
    149     armAsm->bind(&skip);
    150   }
    151 
    152   armAsm = &m_far_emitter;
    153 }
    154 
    155 void CPU::NewRec::AArch64Compiler::SwitchToNearCode(bool emit_jump, vixl::aarch64::Condition cond)
    156 {
    157   DebugAssert(armAsm == &m_far_emitter);
    158   if (emit_jump)
    159   {
    160     const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_emitter.GetCursorAddress<const void*>());
    161     (cond != Condition::al) ? armAsm->b(disp, cond) : armAsm->b(disp);
    162   }
    163   armAsm = &m_emitter;
    164 }
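// Two emitters are kept per block: m_emitter for the main (near) path and m_far_emitter for
// cold paths such as exceptions and slow memory accesses. The Switch* helpers above emit the
// cross-branch and then swap which buffer armAsm points at. The IsInt19/IsInt14 checks exist
// because b.cond encodes a 19-bit and tbz/tbnz a 14-bit signed offset; when the far buffer is
// out of range, an inverted-condition branch skips over an unconditional b (26-bit range).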
    165 
    166 void CPU::NewRec::AArch64Compiler::EmitMov(const vixl::aarch64::Register& dst, u32 val)
    167 {
    168   armEmitMov(armAsm, dst, val);
    169 }
    170 
    171 void CPU::NewRec::AArch64Compiler::EmitCall(const void* ptr, bool force_inline /*= false*/)
    172 {
    173   armEmitCall(armAsm, ptr, force_inline);
    174 }
    175 
    176 vixl::aarch64::Operand CPU::NewRec::AArch64Compiler::armCheckAddSubConstant(s32 val)
    177 {
    178   if (Assembler::IsImmAddSub(val))
    179     return vixl::aarch64::Operand(static_cast<int64_t>(val));
    180 
    181   EmitMov(RWSCRATCH, static_cast<u32>(val));
    182   return vixl::aarch64::Operand(RWSCRATCH);
    183 }
    184 
    185 vixl::aarch64::Operand CPU::NewRec::AArch64Compiler::armCheckAddSubConstant(u32 val)
    186 {
    187   return armCheckAddSubConstant(static_cast<s32>(val));
    188 }
    189 
    190 vixl::aarch64::Operand CPU::NewRec::AArch64Compiler::armCheckCompareConstant(s32 val)
    191 {
    192   if (Assembler::IsImmConditionalCompare(val))
    193     return vixl::aarch64::Operand(static_cast<int64_t>(val));
    194 
    195   EmitMov(RWSCRATCH, static_cast<u32>(val));
    196   return vixl::aarch64::Operand(RWSCRATCH);
    197 }
    198 
    199 vixl::aarch64::Operand CPU::NewRec::AArch64Compiler::armCheckLogicalConstant(u32 val)
    200 {
    201   if (Assembler::IsImmLogical(val, 32))
    202     return vixl::aarch64::Operand(static_cast<s64>(static_cast<u64>(val)));
    203 
    204   EmitMov(RWSCRATCH, val);
    205   return vixl::aarch64::Operand(RWSCRATCH);
    206 }
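// These helpers work around AArch64's limited immediate encodings: add/sub accepts a 12-bit
// immediate (optionally shifted left by 12) and logical ops only accept bitmask patterns, so
// any constant vixl reports as unencodable is first materialized into RWSCRATCH and passed as
// a register operand instead. Illustrative use:
//   armAsm->add(rt, rs, armCheckAddSubConstant(imm)); // immediate if it encodes, else RWSCRATCH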
    207 
    208 void CPU::NewRec::AArch64Compiler::BeginBlock()
    209 {
    210   Compiler::BeginBlock();
    211 }
    212 
    213 void CPU::NewRec::AArch64Compiler::GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size)
    214 {
     215   // move the base addresses into registers first to reduce code size, since later accesses can use immediate offsets
    216   armMoveAddressToReg(armAsm, RXARG1, ram_ptr);
    217   armMoveAddressToReg(armAsm, RXARG2, shadow_ptr);
    218 
    219   bool first = true;
    220   u32 offset = 0;
    221   Label block_changed;
    222 
    223   while (size >= 16)
    224   {
    225     const VRegister vtmp = v2.V4S();
    226     const VRegister dst = first ? v0.V4S() : v1.V4S();
    227     armAsm->ldr(dst, MemOperand(RXARG1, offset));
    228     armAsm->ldr(vtmp, MemOperand(RXARG2, offset));
    229     armAsm->cmeq(dst, dst, vtmp);
    230     if (!first)
    231       armAsm->and_(v0.V16B(), v0.V16B(), dst.V16B());
    232     else
    233       first = false;
    234 
    235     offset += 16;
    236     size -= 16;
    237   }
    238 
    239   if (!first)
    240   {
    241     // TODO: make sure this doesn't choke on ffffffff
    242     armAsm->uminv(s0, v0.V4S());
    243     armAsm->fcmp(s0, 0.0);
    244     armAsm->b(&block_changed, eq);
    245   }
    246 
    247   while (size >= 8)
    248   {
    249     armAsm->ldr(RXARG3, MemOperand(RXARG1, offset));
    250     armAsm->ldr(RXSCRATCH, MemOperand(RXARG2, offset));
    251     armAsm->cmp(RXARG3, RXSCRATCH);
    252     armAsm->b(&block_changed, ne);
    253     offset += 8;
    254     size -= 8;
    255   }
    256 
    257   while (size >= 4)
    258   {
    259     armAsm->ldr(RWARG3, MemOperand(RXARG1, offset));
    260     armAsm->ldr(RWSCRATCH, MemOperand(RXARG2, offset));
    261     armAsm->cmp(RWARG3, RWSCRATCH);
    262     armAsm->b(&block_changed, ne);
    263     offset += 4;
    264     size -= 4;
    265   }
    266 
    267   DebugAssert(size == 0);
    268 
    269   Label block_unchanged;
    270   armAsm->b(&block_unchanged);
    271   armAsm->bind(&block_changed);
    272   armEmitJmp(armAsm, CodeCache::g_discard_and_recompile_block, false);
    273   armAsm->bind(&block_unchanged);
    274 }
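// The protect check is effectively a memcmp() of the block's RAM against the shadow copy taken
// at compile time: 16-byte chunks are compared with NEON (per-lane cmeq results AND-accumulated
// into v0, then uminv/fcmp detects any zero, i.e. differing, lane), and the remainder uses 8-
// and 4-byte scalar compares. Any mismatch branches to g_discard_and_recompile_block.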
    275 
    276 void CPU::NewRec::AArch64Compiler::GenerateICacheCheckAndUpdate()
    277 {
    278   if (!m_block->HasFlag(CodeCache::BlockFlags::IsUsingICache))
    279   {
    280     if (m_block->HasFlag(CodeCache::BlockFlags::NeedsDynamicFetchTicks))
    281     {
    282       armEmitFarLoad(armAsm, RWARG2, GetFetchMemoryAccessTimePtr());
    283       armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
    284       armEmitMov(armAsm, RWARG3, m_block->size);
    285       armAsm->mul(RWARG2, RWARG2, RWARG3);
    286       armAsm->add(RWARG1, RWARG1, RWARG2);
    287       armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
    288     }
    289     else
    290     {
    291       armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
    292       armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(static_cast<u32>(m_block->uncached_fetch_ticks)));
    293       armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
    294     }
    295   }
    296   else if (m_block->icache_line_count > 0)
    297   {
    298     const auto& ticks_reg = RWARG1;
    299     const auto& current_tag_reg = RWARG2;
    300     const auto& existing_tag_reg = RWARG3;
    301 
    302     VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK;
    303     armAsm->ldr(ticks_reg, PTR(&g_state.pending_ticks));
    304     armEmitMov(armAsm, current_tag_reg, current_pc);
    305 
    306     for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE)
    307     {
    308       const TickCount fill_ticks = GetICacheFillTicks(current_pc);
    309       if (fill_ticks <= 0)
    310         continue;
    311 
    312       const u32 line = GetICacheLine(current_pc);
    313       const u32 offset = OFFSETOF(State, icache_tags) + (line * sizeof(u32));
    314 
    315       Label cache_hit;
    316       armAsm->ldr(existing_tag_reg, MemOperand(RSTATE, offset));
    317       armAsm->cmp(existing_tag_reg, current_tag_reg);
    318       armAsm->b(&cache_hit, eq);
    319 
    320       armAsm->str(current_tag_reg, MemOperand(RSTATE, offset));
    321       armAsm->add(ticks_reg, ticks_reg, armCheckAddSubConstant(static_cast<u32>(fill_ticks)));
    322       armAsm->bind(&cache_hit);
    323 
    324       if (i != (m_block->icache_line_count - 1))
    325         armAsm->add(current_tag_reg, current_tag_reg, armCheckAddSubConstant(ICACHE_LINE_SIZE));
    326     }
    327 
    328     armAsm->str(ticks_reg, PTR(&g_state.pending_ticks));
    329   }
    330 }
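// Fetch-timing simulation: blocks that bypass the instruction cache simply add their fetch cost
// to pending_ticks (a precomputed constant, or size * the current fetch access time when
// NeedsDynamicFetchTicks is set). Cached blocks instead walk every icache line they span,
// compare the stored tag with the tag for the current PC, and on a miss update the tag and
// charge that line's fill ticks.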
    331 
    332 void CPU::NewRec::AArch64Compiler::GenerateCall(const void* func, s32 arg1reg /*= -1*/, s32 arg2reg /*= -1*/,
    333                                                 s32 arg3reg /*= -1*/)
    334 {
    335   if (arg1reg >= 0 && arg1reg != static_cast<s32>(RXARG1.GetCode()))
    336     armAsm->mov(RXARG1, XRegister(arg1reg));
    337   if (arg2reg >= 0 && arg2reg != static_cast<s32>(RXARG2.GetCode()))
    338     armAsm->mov(RXARG2, XRegister(arg2reg));
    339   if (arg3reg >= 0 && arg3reg != static_cast<s32>(RXARG3.GetCode()))
    340     armAsm->mov(RXARG3, XRegister(arg3reg));
    341   EmitCall(func);
    342 }
    343 
    344 void CPU::NewRec::AArch64Compiler::EndBlock(const std::optional<u32>& newpc, bool do_event_test)
    345 {
    346   if (newpc.has_value())
    347   {
    348     if (m_dirty_pc || m_compiler_pc != newpc)
    349     {
    350       EmitMov(RWSCRATCH, newpc.value());
    351       armAsm->str(RWSCRATCH, PTR(&g_state.pc));
    352     }
    353   }
    354   m_dirty_pc = false;
    355 
    356   // flush regs
    357   Flush(FLUSH_END_BLOCK);
    358   EndAndLinkBlock(newpc, do_event_test, false);
    359 }
    360 
    361 void CPU::NewRec::AArch64Compiler::EndBlockWithException(Exception excode)
    362 {
    363   // flush regs, but not pc, it's going to get overwritten
    364   // flush cycles because of the GTE instruction stuff...
    365   Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);
    366 
    367   // TODO: flush load delay
    368   // TODO: break for pcdrv
    369 
    370   EmitMov(RWARG1, Cop0Registers::CAUSE::MakeValueForException(excode, m_current_instruction_branch_delay_slot, false,
    371                                                               inst->cop.cop_n));
    372   EmitMov(RWARG2, m_current_instruction_pc);
    373   EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
    374   m_dirty_pc = false;
    375 
    376   EndAndLinkBlock(std::nullopt, true, false);
    377 }
    378 
    379 void CPU::NewRec::AArch64Compiler::EndAndLinkBlock(const std::optional<u32>& newpc, bool do_event_test,
    380                                                    bool force_run_events)
    381 {
    382   // event test
    383   // pc should've been flushed
    384   DebugAssert(!m_dirty_pc && !m_block_ended);
    385   m_block_ended = true;
    386 
    387   // TODO: try extracting this to a function
    388 
    389   // save cycles for event test
    390   const TickCount cycles = std::exchange(m_cycles, 0);
    391 
    392   // pending_ticks += cycles
    393   // if (pending_ticks >= downcount) { dispatch_event(); }
    394   if (do_event_test || m_gte_done_cycle > cycles || cycles > 0)
    395     armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
    396   if (do_event_test)
    397     armAsm->ldr(RWARG2, PTR(&g_state.downcount));
    398   if (cycles > 0)
    399     armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(cycles));
    400   if (m_gte_done_cycle > cycles)
    401   {
    402     armAsm->add(RWARG2, RWARG1, armCheckAddSubConstant(m_gte_done_cycle - cycles));
    403     armAsm->str(RWARG2, PTR(&g_state.gte_completion_tick));
    404   }
    405   if (do_event_test)
    406     armAsm->cmp(RWARG1, RWARG2);
    407   if (cycles > 0)
    408     armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
    409   if (do_event_test)
    410     armEmitCondBranch(armAsm, ge, CodeCache::g_run_events_and_dispatch);
    411 
    412   // jump to dispatcher or next block
    413   if (force_run_events)
    414   {
    415     armEmitJmp(armAsm, CodeCache::g_run_events_and_dispatch, false);
    416   }
    417   else if (!newpc.has_value())
    418   {
    419     armEmitJmp(armAsm, CodeCache::g_dispatcher, false);
    420   }
    421   else
    422   {
    423     if (newpc.value() == m_block->pc)
    424     {
    425       // Special case: ourselves! No need to backlink then.
    426       DEBUG_LOG("Linking block at {:08X} to self", m_block->pc);
    427       armEmitJmp(armAsm, armAsm->GetBuffer()->GetStartAddress<const void*>(), true);
    428     }
    429     else
    430     {
    431       const void* target = CodeCache::CreateBlockLink(m_block, armAsm->GetCursorAddress<void*>(), newpc.value());
    432       armEmitJmp(armAsm, target, true);
    433     }
    434   }
    435 }
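// The emitted epilogue is, roughly (a sketch, not literal output):
//   pending_ticks += cycles;                 // plus a gte_completion_tick update when needed
//   if (do_event_test && pending_ticks >= downcount)
//     goto g_run_events_and_dispatch;
//   goto <dispatcher | linked block | self>;
// Exits with a known PC go through CodeCache::CreateBlockLink(), so the jump can later be
// pointed directly at the target block once it has been compiled.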
    436 
    437 const void* CPU::NewRec::AArch64Compiler::EndCompile(u32* code_size, u32* far_code_size)
    438 {
    439 #ifdef VIXL_DEBUG
    440   m_emitter_check.reset();
    441   m_far_emitter_check.reset();
    442 #endif
    443 
    444   m_emitter.FinalizeCode();
    445   m_far_emitter.FinalizeCode();
    446 
    447   u8* const code = m_emitter.GetBuffer()->GetStartAddress<u8*>();
    448   *code_size = static_cast<u32>(m_emitter.GetCursorOffset());
    449   *far_code_size = static_cast<u32>(m_far_emitter.GetCursorOffset());
    450   armAsm = nullptr;
    451   return code;
    452 }
    453 
    454 const char* CPU::NewRec::AArch64Compiler::GetHostRegName(u32 reg) const
    455 {
    456   static constexpr std::array<const char*, 32> reg64_names = {
    457     {"x0",  "x1",  "x2",  "x3",  "x4",  "x5",  "x6",  "x7",  "x8",  "x9",  "x10", "x11", "x12", "x13", "x14", "x15",
    458      "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "fp",  "lr",  "sp"}};
    459   return (reg < reg64_names.size()) ? reg64_names[reg] : "UNKNOWN";
    460 }
    461 
    462 void CPU::NewRec::AArch64Compiler::LoadHostRegWithConstant(u32 reg, u32 val)
    463 {
    464   EmitMov(WRegister(reg), val);
    465 }
    466 
    467 void CPU::NewRec::AArch64Compiler::LoadHostRegFromCPUPointer(u32 reg, const void* ptr)
    468 {
    469   armAsm->ldr(WRegister(reg), PTR(ptr));
    470 }
    471 
    472 void CPU::NewRec::AArch64Compiler::StoreHostRegToCPUPointer(u32 reg, const void* ptr)
    473 {
    474   armAsm->str(WRegister(reg), PTR(ptr));
    475 }
    476 
    477 void CPU::NewRec::AArch64Compiler::StoreConstantToCPUPointer(u32 val, const void* ptr)
    478 {
    479   if (val == 0)
    480   {
    481     armAsm->str(wzr, PTR(ptr));
    482     return;
    483   }
    484 
    485   EmitMov(RWSCRATCH, val);
    486   armAsm->str(RWSCRATCH, PTR(ptr));
    487 }
    488 
    489 void CPU::NewRec::AArch64Compiler::CopyHostReg(u32 dst, u32 src)
    490 {
    491   if (src != dst)
    492     armAsm->mov(WRegister(dst), WRegister(src));
    493 }
    494 
    495 void CPU::NewRec::AArch64Compiler::AssertRegOrConstS(CompileFlags cf) const
    496 {
    497   DebugAssert(cf.valid_host_s || cf.const_s);
    498 }
    499 
    500 void CPU::NewRec::AArch64Compiler::AssertRegOrConstT(CompileFlags cf) const
    501 {
    502   DebugAssert(cf.valid_host_t || cf.const_t);
    503 }
    504 
    505 vixl::aarch64::MemOperand CPU::NewRec::AArch64Compiler::MipsPtr(Reg r) const
    506 {
    507   DebugAssert(r < Reg::count);
    508   return PTR(&g_state.regs.r[static_cast<u32>(r)]);
    509 }
    510 
    511 vixl::aarch64::Register CPU::NewRec::AArch64Compiler::CFGetRegD(CompileFlags cf) const
    512 {
    513   DebugAssert(cf.valid_host_d);
    514   return WRegister(cf.host_d);
    515 }
    516 
    517 vixl::aarch64::Register CPU::NewRec::AArch64Compiler::CFGetRegS(CompileFlags cf) const
    518 {
    519   DebugAssert(cf.valid_host_s);
    520   return WRegister(cf.host_s);
    521 }
    522 
    523 vixl::aarch64::Register CPU::NewRec::AArch64Compiler::CFGetRegT(CompileFlags cf) const
    524 {
    525   DebugAssert(cf.valid_host_t);
    526   return WRegister(cf.host_t);
    527 }
    528 
    529 vixl::aarch64::Register CPU::NewRec::AArch64Compiler::CFGetRegLO(CompileFlags cf) const
    530 {
    531   DebugAssert(cf.valid_host_lo);
    532   return WRegister(cf.host_lo);
    533 }
    534 
    535 vixl::aarch64::Register CPU::NewRec::AArch64Compiler::CFGetRegHI(CompileFlags cf) const
    536 {
    537   DebugAssert(cf.valid_host_hi);
    538   return WRegister(cf.host_hi);
    539 }
    540 
    541 void CPU::NewRec::AArch64Compiler::MoveSToReg(const vixl::aarch64::Register& dst, CompileFlags cf)
    542 {
    543   DebugAssert(dst.IsW());
    544   if (cf.valid_host_s)
    545   {
    546     if (cf.host_s != dst.GetCode())
    547       armAsm->mov(dst, WRegister(cf.host_s));
    548   }
    549   else if (cf.const_s)
    550   {
    551     const u32 cv = GetConstantRegU32(cf.MipsS());
    552     if (cv == 0)
    553       armAsm->mov(dst, wzr);
    554     else
    555       EmitMov(dst, cv);
    556   }
    557   else
    558   {
    559     WARNING_LOG("Hit memory path in MoveSToReg() for {}", GetRegName(cf.MipsS()));
    560     armAsm->ldr(dst, PTR(&g_state.regs.r[cf.mips_s]));
    561   }
    562 }
    563 
    564 void CPU::NewRec::AArch64Compiler::MoveTToReg(const vixl::aarch64::Register& dst, CompileFlags cf)
    565 {
    566   DebugAssert(dst.IsW());
    567   if (cf.valid_host_t)
    568   {
    569     if (cf.host_t != dst.GetCode())
    570       armAsm->mov(dst, WRegister(cf.host_t));
    571   }
    572   else if (cf.const_t)
    573   {
    574     const u32 cv = GetConstantRegU32(cf.MipsT());
    575     if (cv == 0)
    576       armAsm->mov(dst, wzr);
    577     else
    578       EmitMov(dst, cv);
    579   }
    580   else
    581   {
    582     WARNING_LOG("Hit memory path in MoveTToReg() for {}", GetRegName(cf.MipsT()));
    583     armAsm->ldr(dst, PTR(&g_state.regs.r[cf.mips_t]));
    584   }
    585 }
    586 
    587 void CPU::NewRec::AArch64Compiler::MoveMIPSRegToReg(const vixl::aarch64::Register& dst, Reg reg)
    588 {
    589   DebugAssert(reg < Reg::count && dst.IsW());
    590   if (const std::optional<u32> hreg = CheckHostReg(0, Compiler::HR_TYPE_CPU_REG, reg))
    591     armAsm->mov(dst, WRegister(hreg.value()));
    592   else if (HasConstantReg(reg))
    593     EmitMov(dst, GetConstantRegU32(reg));
    594   else
    595     armAsm->ldr(dst, MipsPtr(reg));
    596 }
    597 
    598 void CPU::NewRec::AArch64Compiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val,
    599                                                                 Reg arg2reg /* = Reg::count */,
    600                                                                 Reg arg3reg /* = Reg::count */)
    601 {
    602   DebugAssert(g_settings.gpu_pgxp_enable);
    603 
    604   Flush(FLUSH_FOR_C_CALL);
    605 
    606   if (arg2reg != Reg::count)
    607     MoveMIPSRegToReg(RWARG2, arg2reg);
    608   if (arg3reg != Reg::count)
    609     MoveMIPSRegToReg(RWARG3, arg3reg);
    610 
    611   EmitMov(RWARG1, arg1val);
    612   EmitCall(func);
    613 }
    614 
    615 void CPU::NewRec::AArch64Compiler::Flush(u32 flags)
    616 {
    617   Compiler::Flush(flags);
    618 
    619   if (flags & FLUSH_PC && m_dirty_pc)
    620   {
    621     StoreConstantToCPUPointer(m_compiler_pc, &g_state.pc);
    622     m_dirty_pc = false;
    623   }
    624 
    625   if (flags & FLUSH_INSTRUCTION_BITS)
    626   {
    627     // This sucks, but it's only used for fallbacks.
    628     EmitMov(RWARG1, inst->bits);
    629     EmitMov(RWARG2, m_current_instruction_pc);
    630     EmitMov(RWARG3, m_current_instruction_branch_delay_slot);
    631     armAsm->str(RWARG1, PTR(&g_state.current_instruction.bits));
    632     armAsm->str(RWARG2, PTR(&g_state.current_instruction_pc));
    633     armAsm->strb(RWARG3, PTR(&g_state.current_instruction_in_branch_delay_slot));
    634   }
    635 
    636   if (flags & FLUSH_LOAD_DELAY_FROM_STATE && m_load_delay_dirty)
    637   {
    638     // This sucks :(
    639     // TODO: make it a function?
    640     armAsm->ldrb(RWARG1, PTR(&g_state.load_delay_reg));
    641     armAsm->ldr(RWARG2, PTR(&g_state.load_delay_value));
    642     EmitMov(RWSCRATCH, OFFSETOF(CPU::State, regs.r[0]));
    643     armAsm->add(RWARG1, RWSCRATCH, vixl::aarch64::Operand(RWARG1, LSL, 2));
    644     armAsm->str(RWARG2, MemOperand(RSTATE, RXARG1));
    645     EmitMov(RWSCRATCH, static_cast<u8>(Reg::count));
    646     armAsm->strb(RWSCRATCH, PTR(&g_state.load_delay_reg));
    647     m_load_delay_dirty = false;
    648   }
    649 
    650   if (flags & FLUSH_LOAD_DELAY && m_load_delay_register != Reg::count)
    651   {
    652     if (m_load_delay_value_register != NUM_HOST_REGS)
    653       FreeHostReg(m_load_delay_value_register);
    654 
    655     EmitMov(RWSCRATCH, static_cast<u8>(m_load_delay_register));
    656     armAsm->strb(RWSCRATCH, PTR(&g_state.load_delay_reg));
    657     m_load_delay_register = Reg::count;
    658     m_load_delay_dirty = true;
    659   }
    660 
    661   if (flags & FLUSH_GTE_STALL_FROM_STATE && m_dirty_gte_done_cycle)
    662   {
    663     // May as well flush cycles while we're here.
    664     // GTE spanning blocks is very rare, we _could_ disable this for speed.
    665     armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
    666     armAsm->ldr(RWARG2, PTR(&g_state.gte_completion_tick));
    667     if (m_cycles > 0)
    668     {
    669       armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_cycles));
    670       m_cycles = 0;
    671     }
    672     armAsm->cmp(RWARG2, RWARG1);
    673     armAsm->csel(RWARG1, RWARG2, RWARG1, hs);
    674     armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
    675     m_dirty_gte_done_cycle = false;
    676   }
    677 
    678   if (flags & FLUSH_GTE_DONE_CYCLE && m_gte_done_cycle > m_cycles)
    679   {
    680     armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
    681 
    682     // update cycles at the same time
    683     if (flags & FLUSH_CYCLES && m_cycles > 0)
    684     {
    685       armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_cycles));
    686       armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
    687       m_gte_done_cycle -= m_cycles;
    688       m_cycles = 0;
    689     }
    690 
    691     armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_gte_done_cycle));
    692     armAsm->str(RWARG1, PTR(&g_state.gte_completion_tick));
    693     m_gte_done_cycle = 0;
    694     m_dirty_gte_done_cycle = true;
    695   }
    696 
    697   if (flags & FLUSH_CYCLES && m_cycles > 0)
    698   {
    699     armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
    700     armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_cycles));
    701     armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
    702     m_gte_done_cycle = std::max<TickCount>(m_gte_done_cycle - m_cycles, 0);
    703     m_cycles = 0;
    704   }
    705 }
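// Flush() spills whichever pieces of compile-time state the requested flags say must be visible
// in g_state: the PC, the current instruction bits (only needed by interpreter fallbacks), any
// pending load delay, the GTE completion tick, and accumulated cycles.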
    706 
    707 void CPU::NewRec::AArch64Compiler::Compile_Fallback()
    708 {
    709   WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", iinfo->pc, inst->bits);
    710 
    711   Flush(FLUSH_FOR_INTERPRETER);
    712 
    713   EmitCall(reinterpret_cast<const void*>(&CPU::Recompiler::Thunks::InterpretInstruction));
    714 
    715   // TODO: make me less garbage
    716   // TODO: this is wrong, it flushes the load delay on the same cycle when we return.
    717   // but nothing should be going through here..
    718   Label no_load_delay;
    719   armAsm->ldrb(RWARG1, PTR(&g_state.next_load_delay_reg));
    720   armAsm->cmp(RWARG1, static_cast<u8>(Reg::count));
    721   armAsm->b(&no_load_delay, eq);
    722   armAsm->ldr(RWARG2, PTR(&g_state.next_load_delay_value));
    723   armAsm->strb(RWARG1, PTR(&g_state.load_delay_reg));
    724   armAsm->str(RWARG2, PTR(&g_state.load_delay_value));
    725   EmitMov(RWARG1, static_cast<u32>(Reg::count));
    726   armAsm->strb(RWARG1, PTR(&g_state.next_load_delay_reg));
    727   armAsm->bind(&no_load_delay);
    728 
    729   m_load_delay_dirty = EMULATE_LOAD_DELAYS;
    730 }
    731 
    732 void CPU::NewRec::AArch64Compiler::CheckBranchTarget(const vixl::aarch64::Register& pcreg)
    733 {
    734   DebugAssert(pcreg.IsW());
    735   if (!g_settings.cpu_recompiler_memory_exceptions)
    736     return;
    737 
    738   armAsm->tst(pcreg, armCheckLogicalConstant(0x3));
    739   SwitchToFarCode(true, ne);
    740 
    741   BackupHostState();
    742   EndBlockWithException(Exception::AdEL);
    743 
    744   RestoreHostState();
    745   SwitchToNearCode(false);
    746 }
    747 
    748 void CPU::NewRec::AArch64Compiler::Compile_jr(CompileFlags cf)
    749 {
    750   const Register pcreg = CFGetRegS(cf);
    751   CheckBranchTarget(pcreg);
    752 
    753   armAsm->str(pcreg, PTR(&g_state.pc));
    754 
    755   CompileBranchDelaySlot(false);
    756   EndBlock(std::nullopt, true);
    757 }
    758 
    759 void CPU::NewRec::AArch64Compiler::Compile_jalr(CompileFlags cf)
    760 {
    761   const Register pcreg = CFGetRegS(cf);
    762   if (MipsD() != Reg::zero)
    763     SetConstantReg(MipsD(), GetBranchReturnAddress(cf));
    764 
    765   CheckBranchTarget(pcreg);
    766   armAsm->str(pcreg, PTR(&g_state.pc));
    767 
    768   CompileBranchDelaySlot(false);
    769   EndBlock(std::nullopt, true);
    770 }
    771 
    772 void CPU::NewRec::AArch64Compiler::Compile_bxx(CompileFlags cf, BranchCondition cond)
    773 {
    774   AssertRegOrConstS(cf);
    775 
    776   const u32 taken_pc = GetConditionalBranchTarget(cf);
    777 
    778   Flush(FLUSH_FOR_BRANCH);
    779 
    780   DebugAssert(cf.valid_host_s);
    781 
    782   // MipsT() here should equal zero for zero branches.
    783   DebugAssert(cond == BranchCondition::Equal || cond == BranchCondition::NotEqual || cf.MipsT() == Reg::zero);
    784 
    785   Label taken;
    786   const Register rs = CFGetRegS(cf);
    787   switch (cond)
    788   {
    789     case BranchCondition::Equal:
    790     case BranchCondition::NotEqual:
    791     {
    792       AssertRegOrConstT(cf);
    793       if (cf.const_t && HasConstantRegValue(cf.MipsT(), 0))
    794       {
    795         (cond == BranchCondition::Equal) ? armAsm->cbz(rs, &taken) : armAsm->cbnz(rs, &taken);
    796       }
    797       else
    798       {
    799         if (cf.valid_host_t)
    800           armAsm->cmp(rs, CFGetRegT(cf));
    801         else if (cf.const_t)
    802           armAsm->cmp(rs, armCheckCompareConstant(GetConstantRegU32(cf.MipsT())));
    803 
    804         armAsm->b(&taken, (cond == BranchCondition::Equal) ? eq : ne);
    805       }
    806     }
    807     break;
    808 
    809     case BranchCondition::GreaterThanZero:
    810     {
    811       armAsm->cmp(rs, 0);
    812       armAsm->b(&taken, gt);
    813     }
    814     break;
    815 
    816     case BranchCondition::GreaterEqualZero:
    817     {
    818       armAsm->cmp(rs, 0);
    819       armAsm->b(&taken, ge);
    820     }
    821     break;
    822 
    823     case BranchCondition::LessThanZero:
    824     {
    825       armAsm->cmp(rs, 0);
    826       armAsm->b(&taken, lt);
    827     }
    828     break;
    829 
    830     case BranchCondition::LessEqualZero:
    831     {
    832       armAsm->cmp(rs, 0);
    833       armAsm->b(&taken, le);
    834     }
    835     break;
    836   }
    837 
    838   BackupHostState();
    839   if (!cf.delay_slot_swapped)
    840     CompileBranchDelaySlot();
    841 
    842   EndBlock(m_compiler_pc, true);
    843 
    844   armAsm->bind(&taken);
    845 
    846   RestoreHostState();
    847   if (!cf.delay_slot_swapped)
    848     CompileBranchDelaySlot();
    849 
    850   EndBlock(taken_pc, true);
    851 }
    852 
    853 void CPU::NewRec::AArch64Compiler::Compile_addi(CompileFlags cf, bool overflow)
    854 {
    855   const Register rs = CFGetRegS(cf);
    856   const Register rt = CFGetRegT(cf);
    857   if (const u32 imm = inst->i.imm_sext32(); imm != 0)
    858   {
    859     if (!overflow)
    860     {
    861       armAsm->add(rt, rs, armCheckAddSubConstant(imm));
    862     }
    863     else
    864     {
    865       armAsm->adds(rt, rs, armCheckAddSubConstant(imm));
    866       TestOverflow(rt);
    867     }
    868   }
    869   else if (rt.GetCode() != rs.GetCode())
    870   {
    871     armAsm->mov(rt, rs);
    872   }
    873 }
    874 
    875 void CPU::NewRec::AArch64Compiler::Compile_addi(CompileFlags cf)
    876 {
    877   Compile_addi(cf, g_settings.cpu_recompiler_memory_exceptions);
    878 }
    879 
    880 void CPU::NewRec::AArch64Compiler::Compile_addiu(CompileFlags cf)
    881 {
    882   Compile_addi(cf, false);
    883 }
    884 
    885 void CPU::NewRec::AArch64Compiler::Compile_slti(CompileFlags cf)
    886 {
    887   Compile_slti(cf, true);
    888 }
    889 
    890 void CPU::NewRec::AArch64Compiler::Compile_sltiu(CompileFlags cf)
    891 {
    892   Compile_slti(cf, false);
    893 }
    894 
    895 void CPU::NewRec::AArch64Compiler::Compile_slti(CompileFlags cf, bool sign)
    896 {
    897   armAsm->cmp(CFGetRegS(cf), armCheckCompareConstant(static_cast<s32>(inst->i.imm_sext32())));
    898   armAsm->cset(CFGetRegT(cf), sign ? lt : lo);
    899 }
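// cset writes 1 when the condition holds and 0 otherwise; lt (signed less-than) versus lo
// (unsigned lower) is what distinguishes slti from sltiu, and the same pair is reused by
// Compile_slt()/Compile_sltu() further down.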
    900 
    901 void CPU::NewRec::AArch64Compiler::Compile_andi(CompileFlags cf)
    902 {
    903   const Register rt = CFGetRegT(cf);
    904   if (const u32 imm = inst->i.imm_zext32(); imm != 0)
    905     armAsm->and_(rt, CFGetRegS(cf), armCheckLogicalConstant(imm));
    906   else
    907     armAsm->mov(rt, wzr);
    908 }
    909 
    910 void CPU::NewRec::AArch64Compiler::Compile_ori(CompileFlags cf)
    911 {
    912   const Register rt = CFGetRegT(cf);
    913   const Register rs = CFGetRegS(cf);
    914   if (const u32 imm = inst->i.imm_zext32(); imm != 0)
    915     armAsm->orr(rt, rs, armCheckLogicalConstant(imm));
    916   else if (rt.GetCode() != rs.GetCode())
    917     armAsm->mov(rt, rs);
    918 }
    919 
    920 void CPU::NewRec::AArch64Compiler::Compile_xori(CompileFlags cf)
    921 {
    922   const Register rt = CFGetRegT(cf);
    923   const Register rs = CFGetRegS(cf);
    924   if (const u32 imm = inst->i.imm_zext32(); imm != 0)
    925     armAsm->eor(rt, rs, armCheckLogicalConstant(imm));
    926   else if (rt.GetCode() != rs.GetCode())
    927     armAsm->mov(rt, rs);
    928 }
    929 
    930 void CPU::NewRec::AArch64Compiler::Compile_shift(CompileFlags cf,
    931                                                  void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&,
    932                                                                                       const vixl::aarch64::Register&,
    933                                                                                       unsigned))
    934 {
    935   const Register rd = CFGetRegD(cf);
    936   const Register rt = CFGetRegT(cf);
    937   if (inst->r.shamt > 0)
    938     (armAsm->*op)(rd, rt, inst->r.shamt);
    939   else if (rd.GetCode() != rt.GetCode())
    940     armAsm->mov(rd, rt);
    941 }
    942 
    943 void CPU::NewRec::AArch64Compiler::Compile_sll(CompileFlags cf)
    944 {
    945   Compile_shift(cf, &Assembler::lsl);
    946 }
    947 
    948 void CPU::NewRec::AArch64Compiler::Compile_srl(CompileFlags cf)
    949 {
    950   Compile_shift(cf, &Assembler::lsr);
    951 }
    952 
    953 void CPU::NewRec::AArch64Compiler::Compile_sra(CompileFlags cf)
    954 {
    955   Compile_shift(cf, &Assembler::asr);
    956 }
    957 
    958 void CPU::NewRec::AArch64Compiler::Compile_variable_shift(
    959   CompileFlags cf,
    960   void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&, const vixl::aarch64::Register&,
    961                                        const vixl::aarch64::Register&),
    962   void (vixl::aarch64::Assembler::*op_const)(const vixl::aarch64::Register&, const vixl::aarch64::Register&, unsigned))
    963 {
    964   const Register rd = CFGetRegD(cf);
    965 
    966   AssertRegOrConstS(cf);
    967   AssertRegOrConstT(cf);
    968 
    969   const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
    970   if (!cf.valid_host_t)
    971     MoveTToReg(rt, cf);
    972 
    973   if (cf.const_s)
    974   {
    975     if (const u32 shift = GetConstantRegU32(cf.MipsS()); shift != 0)
    976       (armAsm->*op_const)(rd, rt, shift);
    977     else if (rd.GetCode() != rt.GetCode())
    978       armAsm->mov(rd, rt);
    979   }
    980   else
    981   {
    982     (armAsm->*op)(rd, rt, CFGetRegS(cf));
    983   }
    984 }
    985 
    986 void CPU::NewRec::AArch64Compiler::Compile_sllv(CompileFlags cf)
    987 {
    988   Compile_variable_shift(cf, &Assembler::lslv, &Assembler::lsl);
    989 }
    990 
    991 void CPU::NewRec::AArch64Compiler::Compile_srlv(CompileFlags cf)
    992 {
    993   Compile_variable_shift(cf, &Assembler::lsrv, &Assembler::lsr);
    994 }
    995 
    996 void CPU::NewRec::AArch64Compiler::Compile_srav(CompileFlags cf)
    997 {
    998   Compile_variable_shift(cf, &Assembler::asrv, &Assembler::asr);
    999 }
   1000 
   1001 void CPU::NewRec::AArch64Compiler::Compile_mult(CompileFlags cf, bool sign)
   1002 {
   1003   const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;
   1004   if (!cf.valid_host_s)
   1005     MoveSToReg(rs, cf);
   1006 
   1007   const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
   1008   if (!cf.valid_host_t)
   1009     MoveTToReg(rt, cf);
   1010 
   1011   // TODO: if lo/hi gets killed, we can use a 32-bit multiply
   1012   const Register lo = CFGetRegLO(cf);
   1013   const Register hi = CFGetRegHI(cf);
   1014 
   1015   (sign) ? armAsm->smull(lo.X(), rs, rt) : armAsm->umull(lo.X(), rs, rt);
   1016   armAsm->lsr(hi.X(), lo.X(), 32);
   1017 }
   1018 
   1019 void CPU::NewRec::AArch64Compiler::Compile_mult(CompileFlags cf)
   1020 {
   1021   Compile_mult(cf, true);
   1022 }
   1023 
   1024 void CPU::NewRec::AArch64Compiler::Compile_multu(CompileFlags cf)
   1025 {
   1026   Compile_mult(cf, false);
   1027 }
   1028 
   1029 void CPU::NewRec::AArch64Compiler::Compile_div(CompileFlags cf)
   1030 {
   1031   const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;
   1032   if (!cf.valid_host_s)
   1033     MoveSToReg(rs, cf);
   1034 
   1035   const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
   1036   if (!cf.valid_host_t)
   1037     MoveTToReg(rt, cf);
   1038 
   1039   const Register rlo = CFGetRegLO(cf);
   1040   const Register rhi = CFGetRegHI(cf);
   1041 
   1042   // TODO: This could be slightly more optimal
   1043   Label done;
   1044   Label not_divide_by_zero;
   1045   armAsm->cbnz(rt, &not_divide_by_zero);
   1046   armAsm->mov(rhi, rs); // hi = num
   1047   EmitMov(rlo, 1);
   1048   EmitMov(RWSCRATCH, static_cast<u32>(-1));
   1049   armAsm->cmp(rs, 0);
   1050   armAsm->csel(rlo, RWSCRATCH, rlo, ge); // lo = s >= 0 ? -1 : 1
   1051   armAsm->b(&done);
   1052 
   1053   armAsm->bind(&not_divide_by_zero);
   1054   Label not_unrepresentable;
   1055   armAsm->cmp(rs, armCheckCompareConstant(static_cast<s32>(0x80000000u)));
   1056   armAsm->b(&not_unrepresentable, ne);
   1057   armAsm->cmp(rt, armCheckCompareConstant(-1));
   1058   armAsm->b(&not_unrepresentable, ne);
   1059 
   1060   EmitMov(rlo, 0x80000000u);
   1061   EmitMov(rhi, 0);
   1062   armAsm->b(&done);
   1063 
   1064   armAsm->bind(&not_unrepresentable);
   1065 
   1066   armAsm->sdiv(rlo, rs, rt);
   1067 
   1068   // TODO: skip when hi is dead
   1069   armAsm->msub(rhi, rlo, rt, rs);
   1070 
   1071   armAsm->bind(&done);
   1072 }
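// MIPS DIV has defined results even for the cases that are undefined in C, and the generated
// code reproduces them:
//   rt == 0                      -> hi = rs, lo = (rs >= 0) ? -1 : 1
//   rs == 0x80000000 && rt == -1 -> lo = 0x80000000, hi = 0
//   otherwise                    -> lo = rs / rt, hi = rs % rt (sdiv + msub)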
   1073 
   1074 void CPU::NewRec::AArch64Compiler::Compile_divu(CompileFlags cf)
   1075 {
   1076   const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;
   1077   if (!cf.valid_host_s)
   1078     MoveSToReg(rs, cf);
   1079 
   1080   const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
   1081   if (!cf.valid_host_t)
   1082     MoveTToReg(rt, cf);
   1083 
   1084   const Register rlo = CFGetRegLO(cf);
   1085   const Register rhi = CFGetRegHI(cf);
   1086 
   1087   Label done;
   1088   Label not_divide_by_zero;
   1089   armAsm->cbnz(rt, &not_divide_by_zero);
   1090   EmitMov(rlo, static_cast<u32>(-1));
   1091   armAsm->mov(rhi, rs);
   1092   armAsm->b(&done);
   1093 
   1094   armAsm->bind(&not_divide_by_zero);
   1095 
   1096   armAsm->udiv(rlo, rs, rt);
   1097 
   1098   // TODO: skip when hi is dead
   1099   armAsm->msub(rhi, rlo, rt, rs);
   1100 
   1101   armAsm->bind(&done);
   1102 }
   1103 
   1104 void CPU::NewRec::AArch64Compiler::TestOverflow(const vixl::aarch64::Register& result)
   1105 {
   1106   DebugAssert(result.IsW());
   1107   SwitchToFarCode(true, vs);
   1108 
   1109   BackupHostState();
   1110 
   1111   // toss the result
   1112   ClearHostReg(result.GetCode());
   1113 
   1114   EndBlockWithException(Exception::Ov);
   1115 
   1116   RestoreHostState();
   1117 
   1118   SwitchToNearCode(false);
   1119 }
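// Used by the trapping add/sub forms: the preceding adds/subs sets the V flag, and the far-code
// path discards the would-be result and raises Exception::Ov, mirroring the MIPS ADD/ADDI/SUB
// overflow trap.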
   1120 
   1121 void CPU::NewRec::AArch64Compiler::Compile_dst_op(CompileFlags cf,
   1122                                                   void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&,
   1123                                                                                        const vixl::aarch64::Register&,
   1124                                                                                        const vixl::aarch64::Operand&),
   1125                                                   bool commutative, bool logical, bool overflow)
   1126 {
   1127   AssertRegOrConstS(cf);
   1128   AssertRegOrConstT(cf);
   1129 
   1130   const Register rd = CFGetRegD(cf);
   1131   if (cf.valid_host_s && cf.valid_host_t)
   1132   {
   1133     (armAsm->*op)(rd, CFGetRegS(cf), CFGetRegT(cf));
   1134   }
   1135   else if (commutative && (cf.const_s || cf.const_t))
   1136   {
   1137     const Register src = cf.const_s ? CFGetRegT(cf) : CFGetRegS(cf);
   1138     if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0)
   1139     {
   1140       (armAsm->*op)(rd, src, logical ? armCheckLogicalConstant(cv) : armCheckAddSubConstant(cv));
   1141     }
   1142     else
   1143     {
   1144       if (rd.GetCode() != src.GetCode())
   1145         armAsm->mov(rd, src);
   1146       overflow = false;
   1147     }
   1148   }
   1149   else if (cf.const_s)
   1150   {
   1151     // TODO: Check where we can use wzr here
   1152     EmitMov(RWSCRATCH, GetConstantRegU32(cf.MipsS()));
   1153     (armAsm->*op)(rd, RWSCRATCH, CFGetRegT(cf));
   1154   }
   1155   else if (cf.const_t)
   1156   {
   1157     const Register rs = CFGetRegS(cf);
   1158     if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0)
   1159     {
   1160       (armAsm->*op)(rd, rs, logical ? armCheckLogicalConstant(cv) : armCheckAddSubConstant(cv));
   1161     }
   1162     else
   1163     {
   1164       if (rd.GetCode() != rs.GetCode())
   1165         armAsm->mov(rd, rs);
   1166       overflow = false;
   1167     }
   1168   }
   1169 
   1170   if (overflow)
   1171     TestOverflow(rd);
   1172 }
   1173 
   1174 void CPU::NewRec::AArch64Compiler::Compile_add(CompileFlags cf)
   1175 {
   1176   if (g_settings.cpu_recompiler_memory_exceptions)
   1177     Compile_dst_op(cf, &Assembler::adds, true, false, true);
   1178   else
   1179     Compile_dst_op(cf, &Assembler::add, true, false, false);
   1180 }
   1181 
   1182 void CPU::NewRec::AArch64Compiler::Compile_addu(CompileFlags cf)
   1183 {
   1184   Compile_dst_op(cf, &Assembler::add, true, false, false);
   1185 }
   1186 
   1187 void CPU::NewRec::AArch64Compiler::Compile_sub(CompileFlags cf)
   1188 {
   1189   if (g_settings.cpu_recompiler_memory_exceptions)
   1190     Compile_dst_op(cf, &Assembler::subs, false, false, true);
   1191   else
   1192     Compile_dst_op(cf, &Assembler::sub, false, false, false);
   1193 }
   1194 
   1195 void CPU::NewRec::AArch64Compiler::Compile_subu(CompileFlags cf)
   1196 {
   1197   Compile_dst_op(cf, &Assembler::sub, false, false, false);
   1198 }
   1199 
   1200 void CPU::NewRec::AArch64Compiler::Compile_and(CompileFlags cf)
   1201 {
   1202   AssertRegOrConstS(cf);
   1203   AssertRegOrConstT(cf);
   1204 
   1205   // special cases - and with self -> self, and with 0 -> 0
   1206   const Register regd = CFGetRegD(cf);
   1207   if (cf.MipsS() == cf.MipsT())
   1208   {
   1209     armAsm->mov(regd, CFGetRegS(cf));
   1210     return;
   1211   }
   1212   else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))
   1213   {
   1214     armAsm->mov(regd, wzr);
   1215     return;
   1216   }
   1217 
   1218   Compile_dst_op(cf, &Assembler::and_, true, true, false);
   1219 }
   1220 
   1221 void CPU::NewRec::AArch64Compiler::Compile_or(CompileFlags cf)
   1222 {
   1223   AssertRegOrConstS(cf);
   1224   AssertRegOrConstT(cf);
   1225 
   1226   // or/nor with 0 -> no effect
   1227   const Register regd = CFGetRegD(cf);
   1228   if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0) || cf.MipsS() == cf.MipsT())
   1229   {
   1230     cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);
   1231     return;
   1232   }
   1233 
   1234   Compile_dst_op(cf, &Assembler::orr, true, true, false);
   1235 }
   1236 
   1237 void CPU::NewRec::AArch64Compiler::Compile_xor(CompileFlags cf)
   1238 {
   1239   AssertRegOrConstS(cf);
   1240   AssertRegOrConstT(cf);
   1241 
   1242   const Register regd = CFGetRegD(cf);
   1243   if (cf.MipsS() == cf.MipsT())
   1244   {
   1245     // xor with self -> zero
   1246     armAsm->mov(regd, wzr);
   1247     return;
   1248   }
   1249   else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))
   1250   {
   1251     // xor with zero -> no effect
   1252     cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);
   1253     return;
   1254   }
   1255 
   1256   Compile_dst_op(cf, &Assembler::eor, true, true, false);
   1257 }
   1258 
   1259 void CPU::NewRec::AArch64Compiler::Compile_nor(CompileFlags cf)
   1260 {
   1261   Compile_or(cf);
   1262   armAsm->mvn(CFGetRegD(cf), CFGetRegD(cf));
   1263 }
   1264 
   1265 void CPU::NewRec::AArch64Compiler::Compile_slt(CompileFlags cf)
   1266 {
   1267   Compile_slt(cf, true);
   1268 }
   1269 
   1270 void CPU::NewRec::AArch64Compiler::Compile_sltu(CompileFlags cf)
   1271 {
   1272   Compile_slt(cf, false);
   1273 }
   1274 
   1275 void CPU::NewRec::AArch64Compiler::Compile_slt(CompileFlags cf, bool sign)
   1276 {
   1277   AssertRegOrConstS(cf);
   1278   AssertRegOrConstT(cf);
   1279 
   1280   // TODO: swap and reverse op for constants
   1281   if (cf.const_s)
   1282   {
   1283     EmitMov(RWSCRATCH, GetConstantRegS32(cf.MipsS()));
   1284     armAsm->cmp(RWSCRATCH, CFGetRegT(cf));
   1285   }
   1286   else if (cf.const_t)
   1287   {
   1288     armAsm->cmp(CFGetRegS(cf), armCheckCompareConstant(GetConstantRegS32(cf.MipsT())));
   1289   }
   1290   else
   1291   {
   1292     armAsm->cmp(CFGetRegS(cf), CFGetRegT(cf));
   1293   }
   1294 
   1295   armAsm->cset(CFGetRegD(cf), sign ? lt : lo);
   1296 }
   1297 
   1298 vixl::aarch64::Register
   1299 CPU::NewRec::AArch64Compiler::ComputeLoadStoreAddressArg(CompileFlags cf,
   1300                                                          const std::optional<VirtualMemoryAddress>& address,
   1301                                                          const std::optional<const vixl::aarch64::Register>& reg)
   1302 {
   1303   const u32 imm = inst->i.imm_sext32();
   1304   if (cf.valid_host_s && imm == 0 && !reg.has_value())
   1305     return CFGetRegS(cf);
   1306 
   1307   const Register dst = reg.has_value() ? reg.value() : RWARG1;
   1308   if (address.has_value())
   1309   {
   1310     EmitMov(dst, address.value());
   1311   }
   1312   else if (imm == 0)
   1313   {
   1314     if (cf.valid_host_s)
   1315     {
   1316       if (const Register src = CFGetRegS(cf); src.GetCode() != dst.GetCode())
   1317         armAsm->mov(dst, CFGetRegS(cf));
   1318     }
   1319     else
   1320     {
   1321       armAsm->ldr(dst, MipsPtr(cf.MipsS()));
   1322     }
   1323   }
   1324   else
   1325   {
   1326     if (cf.valid_host_s)
   1327     {
   1328       armAsm->add(dst, CFGetRegS(cf), armCheckAddSubConstant(static_cast<s32>(inst->i.imm_sext32())));
   1329     }
   1330     else
   1331     {
   1332       armAsm->ldr(dst, MipsPtr(cf.MipsS()));
   1333       armAsm->add(dst, dst, armCheckAddSubConstant(static_cast<s32>(inst->i.imm_sext32())));
   1334     }
   1335   }
   1336 
   1337   return dst;
   1338 }
   1339 
   1340 template<typename RegAllocFn>
   1341 vixl::aarch64::Register CPU::NewRec::AArch64Compiler::GenerateLoad(const vixl::aarch64::Register& addr_reg,
   1342                                                                    MemoryAccessSize size, bool sign, bool use_fastmem,
   1343                                                                    const RegAllocFn& dst_reg_alloc)
   1344 {
   1345   DebugAssert(addr_reg.IsW());
   1346   if (use_fastmem)
   1347   {
   1348     m_cycles += Bus::RAM_READ_TICKS;
   1349 
   1350     const Register dst = dst_reg_alloc();
   1351 
   1352     if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT)
   1353     {
   1354       DebugAssert(addr_reg.GetCode() != RWARG3.GetCode());
   1355       armAsm->lsr(RXARG3, addr_reg, Bus::FASTMEM_LUT_PAGE_SHIFT);
   1356       armAsm->ldr(RXARG3, MemOperand(RMEMBASE, RXARG3, LSL, 3));
   1357     }
   1358 
   1359     const MemOperand mem =
   1360       MemOperand((g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) ? RXARG3 : RMEMBASE, addr_reg.X());
   1361     u8* start = armAsm->GetCursorAddress<u8*>();
   1362     switch (size)
   1363     {
   1364       case MemoryAccessSize::Byte:
   1365         sign ? armAsm->ldrsb(dst, mem) : armAsm->ldrb(dst, mem);
   1366         break;
   1367 
   1368       case MemoryAccessSize::HalfWord:
   1369         sign ? armAsm->ldrsh(dst, mem) : armAsm->ldrh(dst, mem);
   1370         break;
   1371 
   1372       case MemoryAccessSize::Word:
   1373         armAsm->ldr(dst, mem);
   1374         break;
   1375     }
   1376 
   1377     AddLoadStoreInfo(start, kInstructionSize, addr_reg.GetCode(), dst.GetCode(), size, sign, true);
   1378     return dst;
   1379   }
   1380 
   1381   if (addr_reg.GetCode() != RWARG1.GetCode())
   1382     armAsm->mov(RWARG1, addr_reg);
   1383 
   1384   const bool checked = g_settings.cpu_recompiler_memory_exceptions;
   1385   switch (size)
   1386   {
   1387     case MemoryAccessSize::Byte:
   1388     {
   1389       EmitCall(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::ReadMemoryByte) :
   1390                          reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryByte));
   1391     }
   1392     break;
   1393     case MemoryAccessSize::HalfWord:
   1394     {
   1395       EmitCall(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::ReadMemoryHalfWord) :
   1396                          reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryHalfWord));
   1397     }
   1398     break;
   1399     case MemoryAccessSize::Word:
   1400     {
   1401       EmitCall(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::ReadMemoryWord) :
   1402                          reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryWord));
   1403     }
   1404     break;
   1405   }
   1406 
   1407   // TODO: turn this into an asm function instead
   1408   if (checked)
   1409   {
   1410     SwitchToFarCodeIfBitSet(RXRET, 63);
   1411     BackupHostState();
   1412 
   1413     // Need to stash this in a temp because of the flush.
   1414     const WRegister temp = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));
   1415     armAsm->neg(temp.X(), RXRET);
   1416     armAsm->lsl(temp, temp, 2);
   1417 
   1418     Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);
   1419 
   1420     // cause_bits = (-result << 2) | BD | cop_n
   1421     armAsm->orr(RWARG1, temp,
   1422                 armCheckLogicalConstant(Cop0Registers::CAUSE::MakeValueForException(
   1423                   static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)));
   1424     EmitMov(RWARG2, m_current_instruction_pc);
   1425     EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
   1426     FreeHostReg(temp.GetCode());
   1427     EndBlock(std::nullopt, true);
   1428 
   1429     RestoreHostState();
   1430     SwitchToNearCode(false);
   1431   }
   1432 
   1433   const Register dst_reg = dst_reg_alloc();
   1434   switch (size)
   1435   {
   1436     case MemoryAccessSize::Byte:
   1437     {
   1438       sign ? armAsm->sxtb(dst_reg, RWRET) : armAsm->uxtb(dst_reg, RWRET);
   1439     }
   1440     break;
   1441     case MemoryAccessSize::HalfWord:
   1442     {
   1443       sign ? armAsm->sxth(dst_reg, RWRET) : armAsm->uxth(dst_reg, RWRET);
   1444     }
   1445     break;
   1446     case MemoryAccessSize::Word:
   1447     {
   1448       if (dst_reg.GetCode() != RWRET.GetCode())
   1449         armAsm->mov(dst_reg, RWRET);
   1450     }
   1451     break;
   1452   }
   1453 
   1454   return dst_reg;
   1455 }
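// Fastmem loads are emitted as a single host load: in LUT mode the address is first translated
// through an 8-byte-per-page lookup table indexed by (addr >> FASTMEM_LUT_PAGE_SHIFT), otherwise
// RMEMBASE is used directly as the base. AddLoadStoreInfo() records the instruction, presumably
// so the fault handler can locate and back-patch it to the slow path. Without fastmem, the
// memory thunks are called instead; with memory exceptions enabled, bit 63 of the 64-bit return
// value flags a failed access and the far-code path raises the exception. GenerateStore() below
// follows the same pattern for writes.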
   1456 
   1457 void CPU::NewRec::AArch64Compiler::GenerateStore(const vixl::aarch64::Register& addr_reg,
   1458                                                  const vixl::aarch64::Register& value_reg, MemoryAccessSize size,
   1459                                                  bool use_fastmem)
   1460 {
   1461   DebugAssert(addr_reg.IsW() && value_reg.IsW());
   1462   if (use_fastmem)
   1463   {
   1464     if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT)
   1465     {
   1466       DebugAssert(addr_reg.GetCode() != RWARG3.GetCode());
   1467       armAsm->lsr(RXARG3, addr_reg, Bus::FASTMEM_LUT_PAGE_SHIFT);
   1468       armAsm->ldr(RXARG3, MemOperand(RMEMBASE, RXARG3, LSL, 3));
   1469     }
   1470 
   1471     const MemOperand mem =
   1472       MemOperand((g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) ? RXARG3 : RMEMBASE, addr_reg.X());
   1473     u8* start = armAsm->GetCursorAddress<u8*>();
   1474     switch (size)
   1475     {
   1476       case MemoryAccessSize::Byte:
   1477         armAsm->strb(value_reg, mem);
   1478         break;
   1479 
   1480       case MemoryAccessSize::HalfWord:
   1481         armAsm->strh(value_reg, mem);
   1482         break;
   1483 
   1484       case MemoryAccessSize::Word:
   1485         armAsm->str(value_reg, mem);
   1486         break;
   1487     }
   1488     AddLoadStoreInfo(start, kInstructionSize, addr_reg.GetCode(), value_reg.GetCode(), size, false, false);
   1489     return;
   1490   }
   1491 
   1492   if (addr_reg.GetCode() != RWARG1.GetCode())
   1493     armAsm->mov(RWARG1, addr_reg);
   1494   if (value_reg.GetCode() != RWARG2.GetCode())
   1495     armAsm->mov(RWARG2, value_reg);
   1496 
   1497   const bool checked = g_settings.cpu_recompiler_memory_exceptions;
   1498   switch (size)
   1499   {
   1500     case MemoryAccessSize::Byte:
   1501     {
   1502       EmitCall(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::WriteMemoryByte) :
   1503                          reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryByte));
   1504     }
   1505     break;
   1506     case MemoryAccessSize::HalfWord:
   1507     {
   1508       EmitCall(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::WriteMemoryHalfWord) :
   1509                          reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryHalfWord));
   1510     }
   1511     break;
   1512     case MemoryAccessSize::Word:
   1513     {
   1514       EmitCall(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::WriteMemoryWord) :
   1515                          reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryWord));
   1516     }
   1517     break;
   1518   }
   1519 
   1520   // TODO: turn this into an asm function instead
   1521   if (checked)
   1522   {
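            // The checked write thunks return zero on success; on a nonzero result, take the far path below and
            // raise the exception code it carries (shifted into the CAUSE excode field).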
   1523     SwitchToFarCodeIfRegZeroOrNonZero(RXRET, true);
   1524     BackupHostState();
   1525 
   1526     // Need to stash this in a temp because of the flush.
   1527     const WRegister temp = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));
   1528     armAsm->lsl(temp, RWRET, 2);
   1529 
   1530     Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);
   1531 
   1532     // cause_bits = (result << 2) | BD | cop_n
   1533     armAsm->orr(RWARG1, temp,
   1534                 armCheckLogicalConstant(Cop0Registers::CAUSE::MakeValueForException(
   1535                   static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)));
   1536     EmitMov(RWARG2, m_current_instruction_pc);
   1537     EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
   1538     FreeHostReg(temp.GetCode());
   1539     EndBlock(std::nullopt, true);
   1540 
   1541     RestoreHostState();
   1542     SwitchToNearCode(false);
   1543   }
   1544 }
   1545 
   1546 void CPU::NewRec::AArch64Compiler::Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
   1547                                                const std::optional<VirtualMemoryAddress>& address)
   1548 {
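          // With PGXP enabled the address has to survive the load (it's passed to the PGXP handler afterwards),
          // so it is computed into a callee-saved temp instead of a scratch register.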
   1549   const std::optional<WRegister> addr_reg =
   1550     g_settings.gpu_pgxp_enable ? std::optional<WRegister>(WRegister(AllocateTempHostReg(HR_CALLEE_SAVED))) :
   1551                                  std::optional<WRegister>();
   1552   FlushForLoadStore(address, false, use_fastmem);
   1553   const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
   1554   const Register data = GenerateLoad(addr, size, sign, use_fastmem, [this, cf]() -> Register {
   1555     if (cf.MipsT() == Reg::zero)
   1556       return RWRET;
   1557 
   1558     return WRegister(AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
   1559                                      EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG,
   1560                                      cf.MipsT()));
   1561   });
   1562 
   1563   if (g_settings.gpu_pgxp_enable)
   1564   {
   1565     Flush(FLUSH_FOR_C_CALL);
   1566 
   1567     EmitMov(RWARG1, inst->bits);
   1568     armAsm->mov(RWARG2, addr);
   1569     armAsm->mov(RWARG3, data);
   1570     EmitCall(s_pgxp_mem_load_functions[static_cast<u32>(size)][static_cast<u32>(sign)]);
   1571     FreeHostReg(addr_reg.value().GetCode());
   1572   }
   1573 }
   1574 
   1575 void CPU::NewRec::AArch64Compiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
   1576                                                const std::optional<VirtualMemoryAddress>& address)
   1577 {
   1578   DebugAssert(size == MemoryAccessSize::Word && !sign);
   1579 
   1580   const Register addr = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));
   1581   FlushForLoadStore(address, false, use_fastmem);
   1582 
   1583   // TODO: if address is constant, this can be simplified..
   1584 
   1585   // If we're coming from another block, just flush the load delay and hope for the best..
   1586   if (m_load_delay_dirty)
   1587     UpdateLoadDelay();
   1588 
   1589   // We'd need to be careful here if we weren't overwriting it..
   1590   ComputeLoadStoreAddressArg(cf, address, addr);
   1591   armAsm->and_(RWARG1, addr, armCheckLogicalConstant(~0x3u));
   1592   GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });
   1593 
   1594   if (inst->r.rt == Reg::zero)
   1595   {
   1596     FreeHostReg(addr.GetCode());
   1597     return;
   1598   }
   1599 
   1600   // lwl/lwr from a load-delayed value takes the new value, but is itself load-delayed, so the original value is
   1601   // never written back. NOTE: can't trust T in cf because of the flush
   1602   const Reg rt = inst->r.rt;
   1603   Register value;
   1604   if (m_load_delay_register == rt)
   1605   {
   1606     const u32 existing_ld_rt = (m_load_delay_value_register == NUM_HOST_REGS) ?
   1607                                  AllocateHostReg(HR_MODE_READ, HR_TYPE_LOAD_DELAY_VALUE, rt) :
   1608                                  m_load_delay_value_register;
   1609     RenameHostReg(existing_ld_rt, HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt);
   1610     value = WRegister(existing_ld_rt);
   1611   }
   1612   else
   1613   {
   1614     if constexpr (EMULATE_LOAD_DELAYS)
   1615     {
   1616       value = WRegister(AllocateHostReg(HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt));
   1617       if (const std::optional<u32> rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value())
   1618         armAsm->mov(value, WRegister(rtreg.value()));
   1619       else if (HasConstantReg(rt))
   1620         EmitMov(value, GetConstantRegU32(rt));
   1621       else
   1622         armAsm->ldr(value, MipsPtr(rt));
   1623     }
   1624     else
   1625     {
   1626       value = WRegister(AllocateHostReg(HR_MODE_READ | HR_MODE_WRITE, HR_TYPE_CPU_REG, rt));
   1627     }
   1628   }
   1629 
   1630   DebugAssert(value.GetCode() != RWARG2.GetCode() && value.GetCode() != RWARG3.GetCode());
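          // RWARG2 = (addr & 3) * 8 (bit offset of the addressed byte), RWARG3 = 24 - that shift.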
   1631   armAsm->and_(RWARG2, addr, 3);
   1632   armAsm->lsl(RWARG2, RWARG2, 3); // *8
   1633   EmitMov(RWARG3, 24);
   1634   armAsm->sub(RWARG3, RWARG3, RWARG2);
   1635 
   1636   if (inst->op == InstructionOp::lwl)
   1637   {
   1638     // const u32 mask = UINT32_C(0x00FFFFFF) >> shift;
   1639     // new_value = (value & mask) | (RWRET << (24 - shift));
   1640     EmitMov(RWSCRATCH, 0xFFFFFFu);
   1641     armAsm->lsrv(RWSCRATCH, RWSCRATCH, RWARG2);
   1642     armAsm->and_(value, value, RWSCRATCH);
   1643     armAsm->lslv(RWRET, RWRET, RWARG3);
   1644     armAsm->orr(value, value, RWRET);
   1645   }
   1646   else
   1647   {
   1648     // const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift);
   1649     // new_value = (value & mask) | (RWRET >> shift);
   1650     armAsm->lsrv(RWRET, RWRET, RWARG2);
   1651     EmitMov(RWSCRATCH, 0xFFFFFF00u);
   1652     armAsm->lslv(RWSCRATCH, RWSCRATCH, RWARG3);
   1653     armAsm->and_(value, value, RWSCRATCH);
   1654     armAsm->orr(value, value, RWRET);
   1655   }
   1656 
   1657   FreeHostReg(addr.GetCode());
   1658 
   1659   if (g_settings.gpu_pgxp_enable)
   1660   {
   1661     Flush(FLUSH_FOR_C_CALL);
   1662     armAsm->mov(RWARG3, value);
   1663     armAsm->and_(RWARG2, addr, armCheckLogicalConstant(~0x3u));
   1664     EmitMov(RWARG1, inst->bits);
   1665     EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LW));
   1666   }
   1667 }
   1668 
   1669 void CPU::NewRec::AArch64Compiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
   1670                                                 const std::optional<VirtualMemoryAddress>& address)
   1671 {
   1672   const u32 index = static_cast<u32>(inst->r.rt.GetValue());
   1673   const auto [ptr, action] = GetGTERegisterPointer(index, true);
   1674   const std::optional<WRegister> addr_reg =
   1675     g_settings.gpu_pgxp_enable ? std::optional<WRegister>(WRegister(AllocateTempHostReg(HR_CALLEE_SAVED))) :
   1676                                  std::optional<WRegister>();
   1677   FlushForLoadStore(address, false, use_fastmem);
   1678   const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
   1679   const Register value = GenerateLoad(addr, MemoryAccessSize::Word, false, use_fastmem, [this, action = action]() {
   1680     return (action == GTERegisterAccessAction::CallHandler && g_settings.gpu_pgxp_enable) ?
   1681              WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) :
   1682              RWRET;
   1683   });
   1684 
   1685   switch (action)
   1686   {
   1687     case GTERegisterAccessAction::Ignore:
   1688     {
   1689       break;
   1690     }
   1691 
   1692     case GTERegisterAccessAction::Direct:
   1693     {
   1694       armAsm->str(value, PTR(ptr));
   1695       break;
   1696     }
   1697 
   1698     case GTERegisterAccessAction::SignExtend16:
   1699     {
   1700       armAsm->sxth(RWARG3, value);
   1701       armAsm->str(RWARG3, PTR(ptr));
   1702       break;
   1703     }
   1704 
   1705     case GTERegisterAccessAction::ZeroExtend16:
   1706     {
   1707       armAsm->uxth(RWARG3, value);
   1708       armAsm->str(RWARG3, PTR(ptr));
   1709       break;
   1710     }
   1711 
   1712     case GTERegisterAccessAction::CallHandler:
   1713     {
   1714       Flush(FLUSH_FOR_C_CALL);
   1715       armAsm->mov(RWARG2, value);
   1716       EmitMov(RWARG1, index);
   1717       EmitCall(reinterpret_cast<const void*>(&GTE::WriteRegister));
   1718       break;
   1719     }
   1720 
   1721     case GTERegisterAccessAction::PushFIFO:
   1722     {
   1723       // SXY0 <- SXY1
   1724       // SXY1 <- SXY2
   1725       // SXY2 <- SXYP
   1726       DebugAssert(value.GetCode() != RWARG2.GetCode() && value.GetCode() != RWARG3.GetCode());
   1727       armAsm->ldr(RWARG2, PTR(&g_state.gte_regs.SXY1[0]));
   1728       armAsm->ldr(RWARG3, PTR(&g_state.gte_regs.SXY2[0]));
   1729       armAsm->str(RWARG2, PTR(&g_state.gte_regs.SXY0[0]));
   1730       armAsm->str(RWARG3, PTR(&g_state.gte_regs.SXY1[0]));
   1731       armAsm->str(value, PTR(&g_state.gte_regs.SXY2[0]));
   1732       break;
   1733     }
   1734 
   1735     default:
   1736     {
   1737       Panic("Unknown action");
   1738       return;
   1739     }
   1740   }
   1741 
   1742   if (g_settings.gpu_pgxp_enable)
   1743   {
   1744     Flush(FLUSH_FOR_C_CALL);
   1745     armAsm->mov(RWARG3, value);
   1746     if (value.GetCode() != RWRET.GetCode())
   1747       FreeHostReg(value.GetCode());
   1748     armAsm->mov(RWARG2, addr);
   1749     FreeHostReg(addr_reg.value().GetCode());
   1750     EmitMov(RWARG1, inst->bits);
   1751     EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LWC2));
   1752   }
   1753 }
   1754 
   1755 void CPU::NewRec::AArch64Compiler::Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
   1756                                                const std::optional<VirtualMemoryAddress>& address)
   1757 {
   1758   AssertRegOrConstS(cf);
   1759   AssertRegOrConstT(cf);
   1760 
   1761   const std::optional<WRegister> addr_reg =
   1762     g_settings.gpu_pgxp_enable ? std::optional<WRegister>(WRegister(AllocateTempHostReg(HR_CALLEE_SAVED))) :
   1763                                  std::optional<WRegister>();
   1764   FlushForLoadStore(address, true, use_fastmem);
   1765   const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
   1766   const Register data = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
   1767   if (!cf.valid_host_t)
   1768     MoveTToReg(RWARG2, cf);
   1769 
   1770   GenerateStore(addr, data, size, use_fastmem);
   1771 
   1772   if (g_settings.gpu_pgxp_enable)
   1773   {
   1774     Flush(FLUSH_FOR_C_CALL);
   1775     MoveMIPSRegToReg(RWARG3, cf.MipsT());
   1776     armAsm->mov(RWARG2, addr);
   1777     EmitMov(RWARG1, inst->bits);
   1778     EmitCall(s_pgxp_mem_store_functions[static_cast<u32>(size)]);
   1779     FreeHostReg(addr_reg.value().GetCode());
   1780   }
   1781 }
   1782 
   1783 void CPU::NewRec::AArch64Compiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
   1784                                                const std::optional<VirtualMemoryAddress>& address)
   1785 {
   1786   DebugAssert(size == MemoryAccessSize::Word && !sign);
   1787 
   1788   // TODO: this can take over rt's value if it's no longer needed
   1789   // NOTE: can't trust T in cf because of the alloc
   1790   const Register addr = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));
   1791   const Register value = g_settings.gpu_pgxp_enable ? WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) : RWARG2;
   1792   if (g_settings.gpu_pgxp_enable)
   1793     MoveMIPSRegToReg(value, inst->r.rt);
   1794 
   1795   FlushForLoadStore(address, true, use_fastmem);
   1796 
   1797   // TODO: if address is constant, this can be simplified..
   1798   // We'd need to be careful here if we weren't overwriting it..
   1799   ComputeLoadStoreAddressArg(cf, address, addr);
   1800   armAsm->and_(RWARG1, addr, armCheckLogicalConstant(~0x3u));
   1801   GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });
   1802 
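          // RWSCRATCH = (addr & 3) * 8, then the address is aligned down to the containing word for the final store.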
   1803   armAsm->and_(RWSCRATCH, addr, 3);
   1804   armAsm->lsl(RWSCRATCH, RWSCRATCH, 3); // *8
   1805   armAsm->and_(addr, addr, armCheckLogicalConstant(~0x3u));
   1806 
   1807   // Need to load the value down here when PGXP is off, because it's in a volatile reg that the flush can overwrite.
   1808   if (!g_settings.gpu_pgxp_enable)
   1809     MoveMIPSRegToReg(value, inst->r.rt);
   1810 
   1811   if (inst->op == InstructionOp::swl)
   1812   {
   1813     // const u32 mem_mask = UINT32_C(0xFFFFFF00) << shift;
   1814     // new_value = (RWRET & mem_mask) | (value >> (24 - shift));
   1815     EmitMov(RWARG3, 0xFFFFFF00u);
   1816     armAsm->lslv(RWARG3, RWARG3, RWSCRATCH);
   1817     armAsm->and_(RWRET, RWRET, RWARG3);
   1818 
   1819     EmitMov(RWARG3, 24);
   1820     armAsm->sub(RWARG3, RWARG3, RWSCRATCH);
   1821     armAsm->lsrv(value, value, RWARG3);
   1822     armAsm->orr(value, value, RWRET);
   1823   }
   1824   else
   1825   {
   1826     // const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift);
   1827     // new_value = (RWRET & mem_mask) | (value << shift);
   1828     armAsm->lslv(value, value, RWSCRATCH);
   1829 
   1830     EmitMov(RWARG3, 24);
   1831     armAsm->sub(RWARG3, RWARG3, RWSCRATCH);
   1832     EmitMov(RWSCRATCH, 0x00FFFFFFu);
   1833     armAsm->lsrv(RWSCRATCH, RWSCRATCH, RWARG3);
   1834     armAsm->and_(RWRET, RWRET, RWSCRATCH);
   1835     armAsm->orr(value, value, RWRET);
   1836   }
   1837 
   1838   if (!g_settings.gpu_pgxp_enable)
   1839   {
   1840     GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
   1841     FreeHostReg(addr.GetCode());
   1842   }
   1843   else
   1844   {
   1845     GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
   1846 
   1847     Flush(FLUSH_FOR_C_CALL);
   1848     armAsm->mov(RWARG3, value);
   1849     FreeHostReg(value.GetCode());
   1850     armAsm->mov(RWARG2, addr);
   1851     FreeHostReg(addr.GetCode());
   1852     EmitMov(RWARG1, inst->bits);
   1853     EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SW));
   1854   }
   1855 }
   1856 
   1857 void CPU::NewRec::AArch64Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
   1858                                                 const std::optional<VirtualMemoryAddress>& address)
   1859 {
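          // The address (and the data, when PGXP is enabled) must survive the C calls below (GTE::ReadRegister and/or
          // the PGXP handler), so they are kept in callee-saved temps in those cases; otherwise the argument registers
          // are used directly.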
   1860   const u32 index = static_cast<u32>(inst->r.rt.GetValue());
   1861   const auto [ptr, action] = GetGTERegisterPointer(index, false);
   1862   const Register addr = (g_settings.gpu_pgxp_enable || action == GTERegisterAccessAction::CallHandler) ?
   1863                           WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) :
   1864                           RWARG1;
   1865   const Register data = g_settings.gpu_pgxp_enable ? WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) : RWARG2;
   1866   FlushForLoadStore(address, true, use_fastmem);
   1867   ComputeLoadStoreAddressArg(cf, address, addr);
   1868 
   1869   switch (action)
   1870   {
   1871     case GTERegisterAccessAction::Direct:
   1872     {
   1873       armAsm->ldr(data, PTR(ptr));
   1874     }
   1875     break;
   1876 
   1877     case GTERegisterAccessAction::CallHandler:
   1878     {
   1879       // should already be flushed.. except in fastmem case
   1880       Flush(FLUSH_FOR_C_CALL);
   1881       EmitMov(RWARG1, index);
   1882       EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));
   1883       armAsm->mov(data, RWRET);
   1884     }
   1885     break;
   1886 
   1887     default:
   1888     {
   1889       Panic("Unknown action");
   1890     }
   1891     break;
   1892   }
   1893 
   1894   GenerateStore(addr, data, size, use_fastmem);
   1895   if (!g_settings.gpu_pgxp_enable)
   1896   {
   1897     if (addr.GetCode() != RWARG1.GetCode())
   1898       FreeHostReg(addr.GetCode());
   1899   }
   1900   else
   1901   {
   1902     // TODO: This can be simplified because we don't need to validate in PGXP..
   1903     Flush(FLUSH_FOR_C_CALL);
   1904     armAsm->mov(RWARG3, data);
   1905     FreeHostReg(data.GetCode());
   1906     armAsm->mov(RWARG2, addr);
   1907     FreeHostReg(addr.GetCode());
   1908     EmitMov(RWARG1, inst->bits);
   1909     EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWC2));
   1910   }
   1911 }
   1912 
   1913 void CPU::NewRec::AArch64Compiler::Compile_mtc0(CompileFlags cf)
   1914 {
   1915   // TODO: we need better constant setting here.. which will need backprop
   1916   AssertRegOrConstT(cf);
   1917 
   1918   const Cop0Reg reg = static_cast<Cop0Reg>(MipsD());
   1919   const u32* ptr = GetCop0RegPtr(reg);
   1920   const u32 mask = GetCop0RegWriteMask(reg);
   1921   if (!ptr)
   1922   {
   1923     Compile_Fallback();
   1924     return;
   1925   }
   1926 
   1927   if (mask == 0)
   1928   {
   1929     // if it's a read-only register, ignore
   1930     DEBUG_LOG("Ignoring write to read-only cop0 reg {}", static_cast<u32>(reg));
   1931     return;
   1932   }
   1933 
   1934   // for some registers (currently only SR), we need to know which bits changed
   1935   const bool needs_bit_test = (reg == Cop0Reg::SR);
   1936   const Register new_value = RWARG1;
   1937   const Register old_value = RWARG2;
   1938   const Register changed_bits = RWARG3;
   1939   const Register mask_reg = RWSCRATCH;
   1940 
   1941   // Load old value
   1942   armAsm->ldr(old_value, PTR(ptr));
   1943 
   1944   // No way we fit this in an immediate..
   1945   EmitMov(mask_reg, mask);
   1946 
   1947   // update value
   1948   if (cf.valid_host_t)
   1949     armAsm->and_(new_value, CFGetRegT(cf), mask_reg);
   1950   else
   1951     EmitMov(new_value, GetConstantRegU32(cf.MipsT()) & mask);
   1952 
   1953   if (needs_bit_test)
   1954     armAsm->eor(changed_bits, old_value, new_value);
   1955   armAsm->bic(old_value, old_value, mask_reg);
   1956   armAsm->orr(new_value, old_value, new_value);
   1957   armAsm->str(new_value, PTR(ptr));
   1958 
   1959   if (reg == Cop0Reg::SR)
   1960   {
   1961     // TODO: replace with register backup
   1962     // We could just inline the whole thing..
   1963     Flush(FLUSH_FOR_C_CALL);
   1964 
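            // Bit 16 of SR is Isc (isolate cache): if it changed, the memory map changed, so update the host
            // pointers and reload the fastmem base. The new SR value in RWARG1 is preserved across the call on
            // the stack, keeping sp 16-byte aligned.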
   1965     SwitchToFarCodeIfBitSet(changed_bits, 16);
   1966     armAsm->sub(sp, sp, 16);
   1967     armAsm->str(RWARG1, MemOperand(sp));
   1968     EmitCall(reinterpret_cast<const void*>(&CPU::UpdateMemoryPointers));
   1969     armAsm->ldr(RWARG1, MemOperand(sp));
   1970     armAsm->add(sp, sp, 16);
   1971     armAsm->ldr(RMEMBASE, PTR(&g_state.fastmem_base));
   1972     SwitchToNearCode(true);
   1973 
   1974     TestInterrupts(RWARG1);
   1975   }
   1976   else if (reg == Cop0Reg::CAUSE)
   1977   {
   1978     armAsm->ldr(RWARG1, PTR(&g_state.cop0_regs.sr.bits));
   1979     TestInterrupts(RWARG1);
   1980   }
   1981 
   1982   if (reg == Cop0Reg::DCIC && g_settings.cpu_recompiler_memory_exceptions)
   1983   {
   1984     // TODO: DCIC handling for debug breakpoints
   1985     WARNING_LOG("TODO: DCIC handling for debug breakpoints");
   1986   }
   1987 }
   1988 
   1989 void CPU::NewRec::AArch64Compiler::Compile_rfe(CompileFlags cf)
   1990 {
   1991   // shift mode bits right two, preserving upper bits
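          // (bfxil copies SR[5:2] into SR[3:0], leaving the remaining bits of SR untouched)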
   1992   armAsm->ldr(RWARG1, PTR(&g_state.cop0_regs.sr.bits));
   1993   armAsm->bfxil(RWARG1, RWARG1, 2, 4);
   1994   armAsm->str(RWARG1, PTR(&g_state.cop0_regs.sr.bits));
   1995 
   1996   TestInterrupts(RWARG1);
   1997 }
   1998 
   1999 void CPU::NewRec::AArch64Compiler::TestInterrupts(const vixl::aarch64::Register& sr)
   2000 {
   2001   DebugAssert(sr.IsW());
   2002 
   2003   // if Iec == 0 then goto no_interrupt
   2004   Label no_interrupt;
   2005   armAsm->tbz(sr, 0, &no_interrupt);
   2006 
   2007   // sr & cause
   2008   armAsm->ldr(RWSCRATCH, PTR(&g_state.cop0_regs.cause.bits));
   2009   armAsm->and_(sr, sr, RWSCRATCH);
   2010 
   2011   // ((sr & cause) & 0xff00) == 0 goto no_interrupt
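          // (bits 8..15 are IP & IM here, since sr now holds sr & cause)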
   2012   armAsm->tst(sr, 0xFF00);
   2013 
   2014   SwitchToFarCode(true, ne);
   2015   BackupHostState();
   2016 
   2017   // Update the load delay; this normally happens at the end of an instruction, but we're finishing it early.
   2018   UpdateLoadDelay();
   2019 
   2020   Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);
   2021 
   2022   // Can't use EndBlockWithException() here, because it'll use the wrong PC.
   2023   // Can't use RaiseException() on the fast path if we're the last instruction, because the next PC is unknown.
   2024   if (!iinfo->is_last_instruction)
   2025   {
   2026     EmitMov(RWARG1, Cop0Registers::CAUSE::MakeValueForException(Exception::INT, iinfo->is_branch_instruction, false,
   2027                                                                 (inst + 1)->cop.cop_n));
   2028     EmitMov(RWARG2, m_compiler_pc);
   2029     EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
   2030     m_dirty_pc = false;
   2031     EndAndLinkBlock(std::nullopt, true, false);
   2032   }
   2033   else
   2034   {
   2035     if (m_dirty_pc)
   2036       EmitMov(RWARG1, m_compiler_pc);
   2037     armAsm->str(wzr, PTR(&g_state.downcount));
   2038     if (m_dirty_pc)
   2039       armAsm->str(RWARG1, PTR(&g_state.pc));
   2040     m_dirty_pc = false;
   2041     EndAndLinkBlock(std::nullopt, false, true);
   2042   }
   2043 
   2044   RestoreHostState();
   2045   SwitchToNearCode(false);
   2046 
   2047   armAsm->bind(&no_interrupt);
   2048 }
   2049 
   2050 void CPU::NewRec::AArch64Compiler::Compile_mfc2(CompileFlags cf)
   2051 {
   2052   const u32 index = inst->cop.Cop2Index();
   2053   const Reg rt = inst->r.rt;
   2054 
   2055   const auto [ptr, action] = GetGTERegisterPointer(index, false);
   2056   if (action == GTERegisterAccessAction::Ignore)
   2057     return;
   2058 
   2059   u32 hreg;
   2060   if (action == GTERegisterAccessAction::Direct)
   2061   {
   2062     hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
   2063                            EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
   2064     armAsm->ldr(WRegister(hreg), PTR(ptr));
   2065   }
   2066   else if (action == GTERegisterAccessAction::CallHandler)
   2067   {
   2068     Flush(FLUSH_FOR_C_CALL);
   2069     EmitMov(RWARG1, index);
   2070     EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));
   2071 
   2072     hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
   2073                            EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
   2074     armAsm->mov(WRegister(hreg), RWRET);
   2075   }
   2076   else
   2077   {
   2078     Panic("Unknown action");
   2079     return;
   2080   }
   2081 
   2082   if (g_settings.gpu_pgxp_enable)
   2083   {
   2084     Flush(FLUSH_FOR_C_CALL);
   2085     EmitMov(RWARG1, inst->bits);
   2086     armAsm->mov(RWARG2, WRegister(hreg));
   2087     EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_MFC2));
   2088   }
   2089 }
   2090 
   2091 void CPU::NewRec::AArch64Compiler::Compile_mtc2(CompileFlags cf)
   2092 {
   2093   const u32 index = inst->cop.Cop2Index();
   2094   const auto [ptr, action] = GetGTERegisterPointer(index, true);
   2095   if (action == GTERegisterAccessAction::Ignore)
   2096     return;
   2097 
   2098   if (action == GTERegisterAccessAction::Direct)
   2099   {
   2100     if (cf.const_t)
   2101       StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), ptr);
   2102     else
   2103       armAsm->str(CFGetRegT(cf), PTR(ptr));
   2104   }
   2105   else if (action == GTERegisterAccessAction::SignExtend16 || action == GTERegisterAccessAction::ZeroExtend16)
   2106   {
   2107     const bool sign = (action == GTERegisterAccessAction::SignExtend16);
   2108     if (cf.valid_host_t)
   2109     {
   2110       sign ? armAsm->sxth(RWARG1, CFGetRegT(cf)) : armAsm->uxth(RWARG1, CFGetRegT(cf));
   2111       armAsm->str(RWARG1, PTR(ptr));
   2112     }
   2113     else if (cf.const_t)
   2114     {
   2115       const u16 cv = Truncate16(GetConstantRegU32(cf.MipsT()));
   2116       StoreConstantToCPUPointer(sign ? ::SignExtend32(cv) : ::ZeroExtend32(cv), ptr);
   2117     }
   2118     else
   2119     {
   2120       Panic("Unsupported setup");
   2121     }
   2122   }
   2123   else if (action == GTERegisterAccessAction::CallHandler)
   2124   {
   2125     Flush(FLUSH_FOR_C_CALL);
   2126     EmitMov(RWARG1, index);
   2127     MoveTToReg(RWARG2, cf);
   2128     EmitCall(reinterpret_cast<const void*>(&GTE::WriteRegister));
   2129   }
   2130   else if (action == GTERegisterAccessAction::PushFIFO)
   2131   {
   2132     // SXY0 <- SXY1
   2133     // SXY1 <- SXY2
   2134     // SXY2 <- SXYP
   2135     DebugAssert(RWRET.GetCode() != RWARG2.GetCode() && RWRET.GetCode() != RWARG3.GetCode());
   2136     armAsm->ldr(RWARG2, PTR(&g_state.gte_regs.SXY1[0]));
   2137     armAsm->ldr(RWARG3, PTR(&g_state.gte_regs.SXY2[0]));
   2138     armAsm->str(RWARG2, PTR(&g_state.gte_regs.SXY0[0]));
   2139     armAsm->str(RWARG3, PTR(&g_state.gte_regs.SXY1[0]));
   2140     if (cf.valid_host_t)
   2141       armAsm->str(CFGetRegT(cf), PTR(&g_state.gte_regs.SXY2[0]));
   2142     else if (cf.const_t)
   2143       StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), &g_state.gte_regs.SXY2[0]);
   2144     else
   2145       Panic("Unsupported setup");
   2146   }
   2147   else
   2148   {
   2149     Panic("Unknown action");
   2150   }
   2151 }
   2152 
   2153 void CPU::NewRec::AArch64Compiler::Compile_cop2(CompileFlags cf)
   2154 {
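          // GTE instructions aren't recompiled: look up the interpreter's implementation, call it with the
          // masked opcode bits, and charge the instruction's cycle cost.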
   2155   TickCount func_ticks;
   2156   GTE::InstructionImpl func = GTE::GetInstructionImpl(inst->bits, &func_ticks);
   2157 
   2158   Flush(FLUSH_FOR_C_CALL);
   2159   EmitMov(RWARG1, inst->bits & GTE::Instruction::REQUIRED_BITS_MASK);
   2160   EmitCall(reinterpret_cast<const void*>(func));
   2161 
   2162   AddGTETicks(func_ticks);
   2163 }
   2164 
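        // Emits the slow-path thunk that a backpatched fastmem access jumps to: spill the live caller-saved GPRs,
        // adjust the tick counter, perform the access through the slow memory handlers, restore the registers, and
        // jump back to the code following the patched access.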
   2165 u32 CPU::NewRec::CompileLoadStoreThunk(void* thunk_code, u32 thunk_space, void* code_address, u32 code_size,
   2166                                        TickCount cycles_to_add, TickCount cycles_to_remove, u32 gpr_bitmask,
   2167                                        u8 address_register, u8 data_register, MemoryAccessSize size, bool is_signed,
   2168                                        bool is_load)
   2169 {
   2170   Assembler arm_asm(static_cast<u8*>(thunk_code), thunk_space);
   2171   Assembler* armAsm = &arm_asm;
   2172 
   2173 #ifdef VIXL_DEBUG
   2174   vixl::CodeBufferCheckScope asm_check(armAsm, thunk_space, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
   2175 #endif
   2176 
   2177   static constexpr u32 GPR_SIZE = 8;
   2178 
   2179   // save regs
   2180   u32 num_gprs = 0;
   2181 
   2182   for (u32 i = 0; i < NUM_HOST_REGS; i++)
   2183   {
   2184     if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i))
   2185       num_gprs++;
   2186   }
   2187 
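          // Round up to an even number of 8-byte slots so sp stays 16-byte aligned.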
   2188   const u32 stack_size = (((num_gprs + 1) & ~1u) * GPR_SIZE);
   2189 
   2190   // TODO: use stp+ldp, vixl helper?
   2191 
   2192   if (stack_size > 0)
   2193   {
   2194     armAsm->sub(sp, sp, stack_size);
   2195 
   2196     u32 stack_offset = 0;
   2197     for (u32 i = 0; i < NUM_HOST_REGS; i++)
   2198     {
   2199       if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i))
   2200       {
   2201         armAsm->str(XRegister(i), MemOperand(sp, stack_offset));
   2202         stack_offset += GPR_SIZE;
   2203       }
   2204     }
   2205   }
   2206 
   2207   if (cycles_to_add != 0)
   2208   {
   2209     // NOTE: we have to reload here, because memory writes can run DMA, which can screw with cycles
   2210     Assert(Assembler::IsImmAddSub(cycles_to_add));
   2211     armAsm->ldr(RWSCRATCH, PTR(&g_state.pending_ticks));
   2212     armAsm->add(RWSCRATCH, RWSCRATCH, cycles_to_add);
   2213     armAsm->str(RWSCRATCH, PTR(&g_state.pending_ticks));
   2214   }
   2215 
   2216   if (address_register != static_cast<u8>(RWARG1.GetCode()))
   2217     armAsm->mov(RWARG1, WRegister(address_register));
   2218 
   2219   if (!is_load)
   2220   {
   2221     if (data_register != static_cast<u8>(RWARG2.GetCode()))
   2222       armAsm->mov(RWARG2, WRegister(data_register));
   2223   }
   2224 
   2225   switch (size)
   2226   {
   2227     case MemoryAccessSize::Byte:
   2228     {
   2229       armEmitCall(armAsm,
   2230                   is_load ? reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryByte) :
   2231                             reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryByte),
   2232                   false);
   2233     }
   2234     break;
   2235     case MemoryAccessSize::HalfWord:
   2236     {
   2237       armEmitCall(armAsm,
   2238                   is_load ? reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryHalfWord) :
   2239                             reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryHalfWord),
   2240                   false);
   2241     }
   2242     break;
   2243     case MemoryAccessSize::Word:
   2244     {
   2245       armEmitCall(armAsm,
   2246                   is_load ? reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryWord) :
   2247                             reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryWord),
   2248                   false);
   2249     }
   2250     break;
   2251   }
   2252 
   2253   if (is_load)
   2254   {
   2255     const WRegister dst = WRegister(data_register);
   2256     switch (size)
   2257     {
   2258       case MemoryAccessSize::Byte:
   2259       {
   2260         is_signed ? armAsm->sxtb(dst, RWRET) : armAsm->uxtb(dst, RWRET);
   2261       }
   2262       break;
   2263       case MemoryAccessSize::HalfWord:
   2264       {
   2265         is_signed ? armAsm->sxth(dst, RWRET) : armAsm->uxth(dst, RWRET);
   2266       }
   2267       break;
   2268       case MemoryAccessSize::Word:
   2269       {
   2270         if (dst.GetCode() != RWRET.GetCode())
   2271           armAsm->mov(dst, RWRET);
   2272       }
   2273       break;
   2274     }
   2275   }
   2276 
   2277   if (cycles_to_remove != 0)
   2278   {
   2279     Assert(Assembler::IsImmAddSub(cycles_to_remove));
   2280     armAsm->ldr(RWSCRATCH, PTR(&g_state.pending_ticks));
   2281     armAsm->sub(RWSCRATCH, RWSCRATCH, cycles_to_remove);
   2282     armAsm->str(RWSCRATCH, PTR(&g_state.pending_ticks));
   2283   }
   2284 
   2285   // restore regs
   2286   if (stack_size > 0)
   2287   {
   2288     u32 stack_offset = 0;
   2289     for (u32 i = 0; i < NUM_HOST_REGS; i++)
   2290     {
   2291       if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i))
   2292       {
   2293         armAsm->ldr(XRegister(i), MemOperand(sp, stack_offset));
   2294         stack_offset += GPR_SIZE;
   2295       }
   2296     }
   2297 
   2298     armAsm->add(sp, sp, stack_size);
   2299   }
   2300 
   2301   armEmitJmp(armAsm, static_cast<const u8*>(code_address) + code_size, true);
   2302   armAsm->FinalizeCode();
   2303 
   2304   return static_cast<u32>(armAsm->GetCursorOffset());
   2305 }
   2306 
   2307 #endif // CPU_ARCH_ARM64