duckstation

duckstation, archived from the revision just before upstream relicensed the project as proprietary software; this is the libre version
git clone https://git.neptards.moe/u3shit/duckstation.git

cpu_newrec_compiler_aarch32.cpp (70516B)


      1 // SPDX-FileCopyrightText: 2024 Connor McLaughlin <stenzek@gmail.com>
      2 // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
      3 
      4 #include "cpu_newrec_compiler_aarch32.h"
      5 #include "common/align.h"
      6 #include "common/assert.h"
      7 #include "common/log.h"
      8 #include "common/string_util.h"
      9 #include "cpu_core_private.h"
     10 #include "cpu_pgxp.h"
     11 #include "cpu_recompiler_thunks.h"
     12 #include "cpu_recompiler_types.h"
     13 #include "gte.h"
     14 #include "settings.h"
     15 #include "timing_event.h"
     16 #include <limits>
     17 
     18 #ifdef CPU_ARCH_ARM32
     19 
     20 Log_SetChannel(CPU::NewRec);
     21 
     22 #define PTR(x) vixl::aarch32::MemOperand(RSTATE, (((u8*)(x)) - ((u8*)&g_state)))
     23 #define RMEMBASE vixl::aarch32::r3
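        // PTR(x) builds a MemOperand addressing a g_state field relative to RSTATE, so CPU
        // state can be read/written with a single load/store; RMEMBASE (r3) holds the fastmem
        // base pointer when a block needs it.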
     24 
     25 namespace CPU::NewRec {
     26 
     27 using namespace vixl::aarch32;
     28 
     29 using CPU::Recompiler::armEmitCall;
     30 using CPU::Recompiler::armEmitCondBranch;
     31 using CPU::Recompiler::armEmitFarLoad;
     32 using CPU::Recompiler::armEmitJmp;
     33 using CPU::Recompiler::armEmitMov;
     34 using CPU::Recompiler::armGetJumpTrampoline;
     35 using CPU::Recompiler::armGetPCDisplacement;
     36 using CPU::Recompiler::armIsCallerSavedRegister;
     37 using CPU::Recompiler::armIsPCDisplacementInImmediateRange;
     38 using CPU::Recompiler::armMoveAddressToReg;
     39 
     40 AArch32Compiler s_instance;
     41 Compiler* g_compiler = &s_instance;
     42 
     43 } // namespace CPU::NewRec
     44 
     45 CPU::NewRec::AArch32Compiler::AArch32Compiler() : m_emitter(A32), m_far_emitter(A32)
     46 {
     47 }
     48 
     49 CPU::NewRec::AArch32Compiler::~AArch32Compiler() = default;
     50 
     51 const void* CPU::NewRec::AArch32Compiler::GetCurrentCodePointer()
     52 {
     53   return armAsm->GetCursorAddress<const void*>();
     54 }
     55 
     56 void CPU::NewRec::AArch32Compiler::Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space,
     57                                          u8* far_code_buffer, u32 far_code_space)
     58 {
     59   Compiler::Reset(block, code_buffer, code_buffer_space, far_code_buffer, far_code_space);
     60 
     61   // TODO: don't recreate this every time..
     62   DebugAssert(!armAsm);
     63   m_emitter.GetBuffer()->Reset(code_buffer, code_buffer_space);
     64   m_far_emitter.GetBuffer()->Reset(far_code_buffer, far_code_space);
     65   armAsm = &m_emitter;
     66 
     67 #ifdef VIXL_DEBUG
     68   m_emitter_check = std::make_unique<vixl::CodeBufferCheckScope>(m_emitter.get(), code_buffer_space,
     69                                                                  vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
     70   m_far_emitter_check = std::make_unique<vixl::CodeBufferCheckScope>(
     71     m_far_emitter.get(), far_code_space, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
     72 #endif
     73 
     74   // Need to wipe it out so it's correct when toggling fastmem.
     75   m_host_regs = {};
     76 
     77   const u32 membase_idx =
     78     (CodeCache::IsUsingFastmem() && block->HasFlag(CodeCache::BlockFlags::ContainsLoadStoreInstructions)) ?
     79       RMEMBASE.GetCode() :
     80       NUM_HOST_REGS;
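          // Reserve RMEMBASE only when fastmem is active and this block actually contains
          // loads/stores; otherwise the register stays available to the allocator below.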
     81   for (u32 i = 0; i < NUM_HOST_REGS; i++)
     82   {
     83     HostRegAlloc& ra = m_host_regs[i];
     84 
     85     if (i == RARG1.GetCode() || i == RARG2.GetCode() || i == RARG3.GetCode() || i == RSCRATCH.GetCode() ||
     86         i == RSTATE.GetCode() || i == membase_idx || i == sp.GetCode() || i == pc.GetCode())
     87     {
     88       continue;
     89     }
     90 
     91     ra.flags = HR_USABLE | (armIsCallerSavedRegister(i) ? 0 : HR_CALLEE_SAVED);
     92   }
     93 }
     94 
     95 void CPU::NewRec::AArch32Compiler::SwitchToFarCode(bool emit_jump, vixl::aarch32::ConditionType cond)
     96 {
     97   DebugAssert(armAsm == &m_emitter);
     98   if (emit_jump)
     99   {
    100     const s32 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());
    101     if (armIsPCDisplacementInImmediateRange(disp))
    102     {
    103       Label ldisp(armAsm->GetCursorOffset() + disp);
    104       armAsm->b(cond, &ldisp);
    105     }
    106     else if (cond != vixl::aarch32::al)
    107     {
    108       Label skip;
    109       armAsm->b(Condition(cond).Negate(), &skip);
    110       armEmitJmp(armAsm, m_far_emitter.GetCursorAddress<const void*>(), true);
    111       armAsm->bind(&skip);
    112     }
    113     else
    114     {
    115       armEmitJmp(armAsm, m_far_emitter.GetCursorAddress<const void*>(), true);
    116     }
    117   }
    118   armAsm = &m_far_emitter;
    119 }
    120 
    121 void CPU::NewRec::AArch32Compiler::SwitchToFarCodeIfBitSet(const vixl::aarch32::Register& reg, u32 bit)
    122 {
    123   armAsm->tst(reg, 1u << bit);
    124 
    125   const s32 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());
    126   if (armIsPCDisplacementInImmediateRange(disp))
    127   {
    128     Label ldisp(armAsm->GetCursorOffset() + disp);
    129     armAsm->b(ne, &ldisp);
    130   }
    131   else
    132   {
    133     Label skip;
    134     armAsm->b(eq, &skip);
    135     armEmitJmp(armAsm, m_far_emitter.GetCursorAddress<const void*>(), true);
    136     armAsm->bind(&skip);
    137   }
    138 
    139   armAsm = &m_far_emitter;
    140 }
    141 
    142 void CPU::NewRec::AArch32Compiler::SwitchToFarCodeIfRegZeroOrNonZero(const vixl::aarch32::Register& reg, bool nonzero)
    143 {
    144   armAsm->cmp(reg, 0);
    145 
    146   const s32 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());
    147   if (armIsPCDisplacementInImmediateRange(disp))
    148   {
    149     Label ldisp(armAsm->GetCursorOffset() + disp);
    150     nonzero ? armAsm->b(ne, &ldisp) : armAsm->b(eq, &ldisp);
    151   }
    152   else
    153   {
    154     Label skip;
    155     nonzero ? armAsm->b(eq, &skip) : armAsm->b(ne, &skip);
    156     armEmitJmp(armAsm, m_far_emitter.GetCursorAddress<const void*>(), true);
    157     armAsm->bind(&skip);
    158   }
    159 
    160   armAsm = &m_far_emitter;
    161 }
    162 
    163 void CPU::NewRec::AArch32Compiler::SwitchToNearCode(bool emit_jump, vixl::aarch32::ConditionType cond)
    164 {
    165   DebugAssert(armAsm == &m_far_emitter);
    166   if (emit_jump)
    167   {
    168     const s32 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_emitter.GetCursorAddress<const void*>());
    169     if (armIsPCDisplacementInImmediateRange(disp))
    170     {
    171       Label ldisp(armAsm->GetCursorOffset() + disp);
    172       armAsm->b(cond, &ldisp);
    173     }
    174     else if (cond != vixl::aarch32::al)
    175     {
    176       Label skip;
    177       armAsm->b(Condition(cond).Negate(), &skip);
    178       armEmitJmp(armAsm, m_emitter.GetCursorAddress<const void*>(), true);
    179       armAsm->bind(&skip);
    180     }
    181     else
    182     {
    183       armEmitJmp(armAsm, m_emitter.GetCursorAddress<const void*>(), true);
    184     }
    185   }
    186   armAsm = &m_emitter;
    187 }
    188 
    189 void CPU::NewRec::AArch32Compiler::EmitMov(const vixl::aarch32::Register& dst, u32 val)
    190 {
    191   armEmitMov(armAsm, dst, val);
    192 }
    193 
    194 void CPU::NewRec::AArch32Compiler::EmitCall(const void* ptr, bool force_inline /*= false*/)
    195 {
    196   armEmitCall(armAsm, ptr, force_inline);
    197 }
    198 
    199 vixl::aarch32::Operand CPU::NewRec::AArch32Compiler::armCheckAddSubConstant(s32 val)
    200 {
    201   if (ImmediateA32::IsImmediateA32(static_cast<u32>(val)))
    202     return vixl::aarch32::Operand(static_cast<int32_t>(val));
    203 
    204   EmitMov(RSCRATCH, static_cast<u32>(val));
    205   return vixl::aarch32::Operand(RSCRATCH);
    206 }
    207 
    208 vixl::aarch32::Operand CPU::NewRec::AArch32Compiler::armCheckAddSubConstant(u32 val)
    209 {
    210   return armCheckAddSubConstant(static_cast<s32>(val));
    211 }
    212 
    213 vixl::aarch32::Operand CPU::NewRec::AArch32Compiler::armCheckCompareConstant(s32 val)
    214 {
    215   return armCheckAddSubConstant(val);
    216 }
    217 
    218 vixl::aarch32::Operand CPU::NewRec::AArch32Compiler::armCheckLogicalConstant(u32 val)
    219 {
    220   return armCheckAddSubConstant(val);
    221 }
    222 
    223 void CPU::NewRec::AArch32Compiler::BeginBlock()
    224 {
    225   Compiler::BeginBlock();
    226 }
    227 
    228 void CPU::NewRec::AArch32Compiler::GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size)
    229 {
    230   // Load both pointers up front to reduce code size, since the compares below can then use immediate offsets.
    231   armMoveAddressToReg(armAsm, RARG1, ram_ptr);
    232   armMoveAddressToReg(armAsm, RARG2, shadow_ptr);
    233 
    234   u32 offset = 0;
    235   Label block_changed;
    236 
    237 #if 0
    238   /* TODO: Vectorize
    239 #include <arm_neon.h>
    240 #include <stdint.h>
    241 
    242 bool foo(const void* a, const void* b)
    243 {
    244     uint8x16_t v1 = vld1q_u8((const uint8_t*)a);
    245     uint8x16_t v2 = vld1q_u8((const uint8_t*)b);
    246     uint8x16_t v3 = vld1q_u8((const uint8_t*)a + 16);
    247     uint8x16_t v4 = vld1q_u8((const uint8_t*)b + 16);
    248     uint8x16_t r = vceqq_u8(v1, v2);
    249     uint8x16_t r2 = vceqq_u8(v3, v4);
    250     uint8x16_t r3 = vandq_u8(r, r2);
    251     uint32x2_t rr = vpmin_u32(vget_low_u32(vreinterpretq_u32_u8(r3)), vget_high_u32(vreinterpretq_u32_u8(r3)));
    252     if ((vget_lane_u32(rr, 0) & vget_lane_u32(rr, 1)) != 0xFFFFFFFFu)
    253         return false;
    254     else
    255         return true;
    256 }
    257 */
    258   bool first = true;
    259 
    260   while (size >= 16)
    261   {
    262     const VRegister vtmp = a32::v2.V4S();
    263     const VRegister dst = first ? a32::v0.V4S() : a32::v1.V4S();
    264     m_emit->ldr(dst, a32::MemOperand(RXARG1, offset));
    265     m_emit->ldr(vtmp, a32::MemOperand(RXARG2, offset));
    266     m_emit->cmeq(dst, dst, vtmp);
    267     if (!first)
    268       m_emit->and_(dst.V16B(), dst.V16B(), vtmp.V16B());
    269     else
    270       first = false;
    271 
    272     offset += 16;
    273     size -= 16;
    274   }
    275 
    276   if (!first)
    277   {
    278     // TODO: make sure this doesn't choke on ffffffff
    279     armAsm->uminv(a32::s0, a32::v0.V4S());
    280     armAsm->fcmp(a32::s0, 0.0);
    281     armAsm->b(&block_changed, a32::eq);
    282   }
    283 #endif
    284 
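          // Compare the block's code in RAM against the shadow copy one word at a time; any
          // mismatch means the game overwrote the block, so it is discarded and recompiled.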
    285   while (size >= 4)
    286   {
    287     armAsm->ldr(RARG3, MemOperand(RARG1, offset));
    288     armAsm->ldr(RSCRATCH, MemOperand(RARG2, offset));
    289     armAsm->cmp(RARG3, RSCRATCH);
    290     armAsm->b(ne, &block_changed);
    291     offset += 4;
    292     size -= 4;
    293   }
    294 
    295   DebugAssert(size == 0);
    296 
    297   Label block_unchanged;
    298   armAsm->b(&block_unchanged);
    299   armAsm->bind(&block_changed);
    300   armEmitJmp(armAsm, CodeCache::g_discard_and_recompile_block, false);
    301   armAsm->bind(&block_unchanged);
    302 }
    303 
    304 void CPU::NewRec::AArch32Compiler::GenerateICacheCheckAndUpdate()
    305 {
    306   if (!m_block->HasFlag(CodeCache::BlockFlags::IsUsingICache))
    307   {
    308     if (m_block->HasFlag(CodeCache::BlockFlags::NeedsDynamicFetchTicks))
    309     {
    310       armEmitFarLoad(armAsm, RARG2, GetFetchMemoryAccessTimePtr());
    311       armAsm->ldr(RARG1, PTR(&g_state.pending_ticks));
    312       armEmitMov(armAsm, RARG3, m_block->size);
    313       armAsm->mul(RARG2, RARG2, RARG3);
    314       armAsm->add(RARG1, RARG1, RARG2);
    315       armAsm->str(RARG1, PTR(&g_state.pending_ticks));
    316     }
    317     else
    318     {
    319       armAsm->ldr(RARG1, PTR(&g_state.pending_ticks));
    320       armAsm->add(RARG1, RARG1, armCheckAddSubConstant(static_cast<u32>(m_block->uncached_fetch_ticks)));
    321       armAsm->str(RARG1, PTR(&g_state.pending_ticks));
    322     }
    323   }
    324   else if (m_block->icache_line_count > 0)
    325   {
    326     const auto& ticks_reg = RARG1;
    327     const auto& current_tag_reg = RARG2;
    328     const auto& existing_tag_reg = RARG3;
    329 
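            // Walk each icache line the block spans: compare the stored tag with the block's
            // tag, and on a miss update the tag and charge the line fill time.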
    330     VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK;
    331     armAsm->ldr(ticks_reg, PTR(&g_state.pending_ticks));
    332     armEmitMov(armAsm, current_tag_reg, current_pc);
    333 
    334     for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE)
    335     {
    336       const TickCount fill_ticks = GetICacheFillTicks(current_pc);
    337       if (fill_ticks <= 0)
    338         continue;
    339 
    340       const u32 line = GetICacheLine(current_pc);
    341       const u32 offset = OFFSETOF(State, icache_tags) + (line * sizeof(u32));
    342 
    343       Label cache_hit;
    344       armAsm->ldr(existing_tag_reg, MemOperand(RSTATE, offset));
    345       armAsm->cmp(existing_tag_reg, current_tag_reg);
    346       armAsm->b(eq, &cache_hit);
    347 
    348       armAsm->str(current_tag_reg, MemOperand(RSTATE, offset));
    349       armAsm->add(ticks_reg, ticks_reg, armCheckAddSubConstant(static_cast<u32>(fill_ticks)));
    350       armAsm->bind(&cache_hit);
    351 
    352       if (i != (m_block->icache_line_count - 1))
    353         armAsm->add(current_tag_reg, current_tag_reg, armCheckAddSubConstant(ICACHE_LINE_SIZE));
    354     }
    355 
    356     armAsm->str(ticks_reg, PTR(&g_state.pending_ticks));
    357   }
    358 }
    359 
    360 void CPU::NewRec::AArch32Compiler::GenerateCall(const void* func, s32 arg1reg /*= -1*/, s32 arg2reg /*= -1*/,
    361                                                 s32 arg3reg /*= -1*/)
    362 {
    363   if (arg1reg >= 0 && arg1reg != static_cast<s32>(RARG1.GetCode()))
    364     armAsm->mov(RARG1, Register(arg1reg));
    365   if (arg2reg >= 0 && arg2reg != static_cast<s32>(RARG2.GetCode()))
    366     armAsm->mov(RARG2, Register(arg2reg));
    367   if (arg3reg >= 0 && arg3reg != static_cast<s32>(RARG3.GetCode()))
    368     armAsm->mov(RARG3, Register(arg3reg));
    369   EmitCall(func);
    370 }
    371 
    372 void CPU::NewRec::AArch32Compiler::EndBlock(const std::optional<u32>& newpc, bool do_event_test)
    373 {
    374   if (newpc.has_value())
    375   {
    376     if (m_dirty_pc || m_compiler_pc != newpc)
    377     {
    378       EmitMov(RSCRATCH, newpc.value());
    379       armAsm->str(RSCRATCH, PTR(&g_state.pc));
    380     }
    381   }
    382   m_dirty_pc = false;
    383 
    384   // flush regs
    385   Flush(FLUSH_END_BLOCK);
    386   EndAndLinkBlock(newpc, do_event_test, false);
    387 }
    388 
    389 void CPU::NewRec::AArch32Compiler::EndBlockWithException(Exception excode)
    390 {
    391   // flush regs, but not pc, it's going to get overwritten
    392   // flush cycles because of the GTE instruction stuff...
    393   Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);
    394 
    395   // TODO: flush load delay
    396   // TODO: break for pcdrv
    397 
    398   EmitMov(RARG1, Cop0Registers::CAUSE::MakeValueForException(excode, m_current_instruction_branch_delay_slot, false,
    399                                                              inst->cop.cop_n));
    400   EmitMov(RARG2, m_current_instruction_pc);
    401   EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
    402   m_dirty_pc = false;
    403 
    404   EndAndLinkBlock(std::nullopt, true, false);
    405 }
    406 
    407 void CPU::NewRec::AArch32Compiler::EndAndLinkBlock(const std::optional<u32>& newpc, bool do_event_test,
    408                                                    bool force_run_events)
    409 {
    410   // event test
    411   // pc should've been flushed
    412   DebugAssert(!m_dirty_pc && !m_block_ended);
    413   m_block_ended = true;
    414 
    415   // TODO: try extracting this to a function
    416 
    417   // save cycles for event test
    418   const TickCount cycles = std::exchange(m_cycles, 0);
    419 
    420   // pending_ticks += cycles
    421   // if (pending_ticks >= downcount) { dispatch_event(); }
    422   if (do_event_test || m_gte_done_cycle > cycles || cycles > 0)
    423     armAsm->ldr(RARG1, PTR(&g_state.pending_ticks));
    424   if (do_event_test)
    425     armAsm->ldr(RARG2, PTR(&g_state.downcount));
    426   if (cycles > 0)
    427     armAsm->add(RARG1, RARG1, armCheckAddSubConstant(cycles));
    428   if (m_gte_done_cycle > cycles)
    429   {
    430     armAsm->add(RARG2, RARG1, armCheckAddSubConstant(m_gte_done_cycle - cycles));
    431     armAsm->str(RARG2, PTR(&g_state.gte_completion_tick));
    432   }
    433   if (do_event_test)
    434     armAsm->cmp(RARG1, RARG2);
    435   if (cycles > 0)
    436     armAsm->str(RARG1, PTR(&g_state.pending_ticks));
    437   if (do_event_test)
    438     armEmitCondBranch(armAsm, ge, CodeCache::g_run_events_and_dispatch);
    439 
    440   // jump to dispatcher or next block
    441   if (force_run_events)
    442   {
    443     armEmitJmp(armAsm, CodeCache::g_run_events_and_dispatch, false);
    444   }
    445   else if (!newpc.has_value())
    446   {
    447     armEmitJmp(armAsm, CodeCache::g_dispatcher, false);
    448   }
    449   else
    450   {
    451     if (newpc.value() == m_block->pc)
    452     {
    453       // Special case: ourselves! No need to backlink then.
    454       DEBUG_LOG("Linking block at {:08X} to self", m_block->pc);
    455       armEmitJmp(armAsm, armAsm->GetBuffer()->GetStartAddress<const void*>(), true);
    456     }
    457     else
    458     {
    459       const void* target = CodeCache::CreateBlockLink(m_block, armAsm->GetCursorAddress<void*>(), newpc.value());
    460       armEmitJmp(armAsm, target, true);
    461     }
    462   }
    463 }
    464 
    465 const void* CPU::NewRec::AArch32Compiler::EndCompile(u32* code_size, u32* far_code_size)
    466 {
    467 #ifdef VIXL_DEBUG
    468   m_emitter_check.reset();
    469   m_far_emitter_check.reset();
    470 #endif
    471 
    472   m_emitter.FinalizeCode();
    473   m_far_emitter.FinalizeCode();
    474 
    475   u8* const code = m_emitter.GetBuffer()->GetStartAddress<u8*>();
    476   *code_size = static_cast<u32>(m_emitter.GetCursorOffset());
    477   *far_code_size = static_cast<u32>(m_far_emitter.GetCursorOffset());
    478   armAsm = nullptr;
    479   return code;
    480 }
    481 
    482 const char* CPU::NewRec::AArch32Compiler::GetHostRegName(u32 reg) const
    483 {
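          // Note: this table keeps the AArch64-style names (it is literally reg64_names); on
          // AArch32 only the first 16 entries correspond to actual host registers.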
    484   static constexpr std::array<const char*, 32> reg64_names = {
    485     {"x0",  "x1",  "x2",  "x3",  "x4",  "x5",  "x6",  "x7",  "x8",  "x9",  "x10", "x11", "x12", "x13", "x14", "x15",
    486      "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "fp",  "lr",  "sp"}};
    487   return (reg < reg64_names.size()) ? reg64_names[reg] : "UNKNOWN";
    488 }
    489 
    490 void CPU::NewRec::AArch32Compiler::LoadHostRegWithConstant(u32 reg, u32 val)
    491 {
    492   EmitMov(Register(reg), val);
    493 }
    494 
    495 void CPU::NewRec::AArch32Compiler::LoadHostRegFromCPUPointer(u32 reg, const void* ptr)
    496 {
    497   armAsm->ldr(Register(reg), PTR(ptr));
    498 }
    499 
    500 void CPU::NewRec::AArch32Compiler::StoreHostRegToCPUPointer(u32 reg, const void* ptr)
    501 {
    502   armAsm->str(Register(reg), PTR(ptr));
    503 }
    504 
    505 void CPU::NewRec::AArch32Compiler::StoreConstantToCPUPointer(u32 val, const void* ptr)
    506 {
    507   EmitMov(RSCRATCH, val);
    508   armAsm->str(RSCRATCH, PTR(ptr));
    509 }
    510 
    511 void CPU::NewRec::AArch32Compiler::CopyHostReg(u32 dst, u32 src)
    512 {
    513   if (src != dst)
    514     armAsm->mov(Register(dst), Register(src));
    515 }
    516 
    517 void CPU::NewRec::AArch32Compiler::AssertRegOrConstS(CompileFlags cf) const
    518 {
    519   DebugAssert(cf.valid_host_s || cf.const_s);
    520 }
    521 
    522 void CPU::NewRec::AArch32Compiler::AssertRegOrConstT(CompileFlags cf) const
    523 {
    524   DebugAssert(cf.valid_host_t || cf.const_t);
    525 }
    526 
    527 vixl::aarch32::MemOperand CPU::NewRec::AArch32Compiler::MipsPtr(Reg r) const
    528 {
    529   DebugAssert(r < Reg::count);
    530   return PTR(&g_state.regs.r[static_cast<u32>(r)]);
    531 }
    532 
    533 vixl::aarch32::Register CPU::NewRec::AArch32Compiler::CFGetRegD(CompileFlags cf) const
    534 {
    535   DebugAssert(cf.valid_host_d);
    536   return Register(cf.host_d);
    537 }
    538 
    539 vixl::aarch32::Register CPU::NewRec::AArch32Compiler::CFGetRegS(CompileFlags cf) const
    540 {
    541   DebugAssert(cf.valid_host_s);
    542   return Register(cf.host_s);
    543 }
    544 
    545 vixl::aarch32::Register CPU::NewRec::AArch32Compiler::CFGetRegT(CompileFlags cf) const
    546 {
    547   DebugAssert(cf.valid_host_t);
    548   return Register(cf.host_t);
    549 }
    550 
    551 vixl::aarch32::Register CPU::NewRec::AArch32Compiler::CFGetRegLO(CompileFlags cf) const
    552 {
    553   DebugAssert(cf.valid_host_lo);
    554   return Register(cf.host_lo);
    555 }
    556 
    557 vixl::aarch32::Register CPU::NewRec::AArch32Compiler::CFGetRegHI(CompileFlags cf) const
    558 {
    559   DebugAssert(cf.valid_host_hi);
    560   return Register(cf.host_hi);
    561 }
    562 
    563 vixl::aarch32::Register CPU::NewRec::AArch32Compiler::GetMembaseReg()
    564 {
    565   const u32 code = RMEMBASE.GetCode();
    566   if (!IsHostRegAllocated(code))
    567   {
    568     // Leave usable unset, so we don't try to allocate it later.
    569     m_host_regs[code].type = HR_TYPE_MEMBASE;
    570     m_host_regs[code].flags = HR_ALLOCATED;
    571     armAsm->ldr(RMEMBASE, PTR(&g_state.fastmem_base));
    572   }
    573 
    574   return RMEMBASE;
    575 }
    576 
    577 void CPU::NewRec::AArch32Compiler::MoveSToReg(const vixl::aarch32::Register& dst, CompileFlags cf)
    578 {
    579   if (cf.valid_host_s)
    580   {
    581     if (cf.host_s != dst.GetCode())
    582       armAsm->mov(dst, Register(cf.host_s));
    583   }
    584   else if (cf.const_s)
    585   {
    586     const u32 cv = GetConstantRegU32(cf.MipsS());
    587     EmitMov(dst, cv);
    588   }
    589   else
    590   {
    591     WARNING_LOG("Hit memory path in MoveSToReg() for {}", GetRegName(cf.MipsS()));
    592     armAsm->ldr(dst, PTR(&g_state.regs.r[cf.mips_s]));
    593   }
    594 }
    595 
    596 void CPU::NewRec::AArch32Compiler::MoveTToReg(const vixl::aarch32::Register& dst, CompileFlags cf)
    597 {
    598   if (cf.valid_host_t)
    599   {
    600     if (cf.host_t != dst.GetCode())
    601       armAsm->mov(dst, Register(cf.host_t));
    602   }
    603   else if (cf.const_t)
    604   {
    605     const u32 cv = GetConstantRegU32(cf.MipsT());
    606     EmitMov(dst, cv);
    607   }
    608   else
    609   {
    610     WARNING_LOG("Hit memory path in MoveTToReg() for {}", GetRegName(cf.MipsT()));
    611     armAsm->ldr(dst, PTR(&g_state.regs.r[cf.mips_t]));
    612   }
    613 }
    614 
    615 void CPU::NewRec::AArch32Compiler::MoveMIPSRegToReg(const vixl::aarch32::Register& dst, Reg reg)
    616 {
    617   DebugAssert(reg < Reg::count);
    618   if (const std::optional<u32> hreg = CheckHostReg(0, Compiler::HR_TYPE_CPU_REG, reg))
    619     armAsm->mov(dst, Register(hreg.value()));
    620   else if (HasConstantReg(reg))
    621     EmitMov(dst, GetConstantRegU32(reg));
    622   else
    623     armAsm->ldr(dst, MipsPtr(reg));
    624 }
    625 
    626 void CPU::NewRec::AArch32Compiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val,
    627                                                                 Reg arg2reg /* = Reg::count */,
    628                                                                 Reg arg3reg /* = Reg::count */)
    629 {
    630   DebugAssert(g_settings.gpu_pgxp_enable);
    631 
    632   Flush(FLUSH_FOR_C_CALL);
    633 
    634   if (arg2reg != Reg::count)
    635     MoveMIPSRegToReg(RARG2, arg2reg);
    636   if (arg3reg != Reg::count)
    637     MoveMIPSRegToReg(RARG3, arg3reg);
    638 
    639   EmitMov(RARG1, arg1val);
    640   EmitCall(func);
    641 }
    642 
    643 void CPU::NewRec::AArch32Compiler::Flush(u32 flags)
    644 {
    645   Compiler::Flush(flags);
    646 
    647   if (flags & FLUSH_PC && m_dirty_pc)
    648   {
    649     StoreConstantToCPUPointer(m_compiler_pc, &g_state.pc);
    650     m_dirty_pc = false;
    651   }
    652 
    653   if (flags & FLUSH_INSTRUCTION_BITS)
    654   {
    655     // This sucks, but it's only used for fallbacks.
    656     EmitMov(RARG1, inst->bits);
    657     EmitMov(RARG2, m_current_instruction_pc);
    658     EmitMov(RARG3, m_current_instruction_branch_delay_slot);
    659     armAsm->str(RARG1, PTR(&g_state.current_instruction.bits));
    660     armAsm->str(RARG2, PTR(&g_state.current_instruction_pc));
    661     armAsm->strb(RARG3, PTR(&g_state.current_instruction_in_branch_delay_slot));
    662   }
    663 
    664   if (flags & FLUSH_LOAD_DELAY_FROM_STATE && m_load_delay_dirty)
    665   {
    666     // This sucks :(
    667     // TODO: make it a function?
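            // Effectively: regs.r[load_delay_reg] = load_delay_value, then load_delay_reg =
            // Reg::count to mark the delay slot as empty.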
    668     armAsm->ldrb(RARG1, PTR(&g_state.load_delay_reg));
    669     armAsm->ldr(RARG2, PTR(&g_state.load_delay_value));
    670     EmitMov(RSCRATCH, OFFSETOF(CPU::State, regs.r[0]));
    671     armAsm->add(RARG1, RSCRATCH, vixl::aarch32::Operand(RARG1, LSL, 2));
    672     armAsm->str(RARG2, MemOperand(RSTATE, RARG1));
    673     EmitMov(RSCRATCH, static_cast<u8>(Reg::count));
    674     armAsm->strb(RSCRATCH, PTR(&g_state.load_delay_reg));
    675     m_load_delay_dirty = false;
    676   }
    677 
    678   if (flags & FLUSH_LOAD_DELAY && m_load_delay_register != Reg::count)
    679   {
    680     if (m_load_delay_value_register != NUM_HOST_REGS)
    681       FreeHostReg(m_load_delay_value_register);
    682 
    683     EmitMov(RSCRATCH, static_cast<u8>(m_load_delay_register));
    684     armAsm->strb(RSCRATCH, PTR(&g_state.load_delay_reg));
    685     m_load_delay_register = Reg::count;
    686     m_load_delay_dirty = true;
    687   }
    688 
    689   if (flags & FLUSH_GTE_STALL_FROM_STATE && m_dirty_gte_done_cycle)
    690   {
    691     // May as well flush cycles while we're here.
    692     // GTE spanning blocks is very rare, we _could_ disable this for speed.
    693     armAsm->ldr(RARG1, PTR(&g_state.pending_ticks));
    694     armAsm->ldr(RARG2, PTR(&g_state.gte_completion_tick));
    695     if (m_cycles > 0)
    696     {
    697       armAsm->add(RARG1, RARG1, armCheckAddSubConstant(m_cycles));
    698       m_cycles = 0;
    699     }
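            // If the GTE completion tick is still ahead of pending_ticks, advance pending_ticks
            // to it, stalling the CPU until the in-flight GTE operation would have finished.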
    700     armAsm->cmp(RARG2, RARG1);
    701     armAsm->mov(hs, RARG1, RARG2);
    702     armAsm->str(RARG1, PTR(&g_state.pending_ticks));
    703     m_dirty_gte_done_cycle = false;
    704   }
    705 
    706   if (flags & FLUSH_GTE_DONE_CYCLE && m_gte_done_cycle > m_cycles)
    707   {
    708     armAsm->ldr(RARG1, PTR(&g_state.pending_ticks));
    709 
    710     // update cycles at the same time
    711     if (flags & FLUSH_CYCLES && m_cycles > 0)
    712     {
    713       armAsm->add(RARG1, RARG1, armCheckAddSubConstant(m_cycles));
    714       armAsm->str(RARG1, PTR(&g_state.pending_ticks));
    715       m_gte_done_cycle -= m_cycles;
    716       m_cycles = 0;
    717     }
    718 
    719     armAsm->add(RARG1, RARG1, armCheckAddSubConstant(m_gte_done_cycle));
    720     armAsm->str(RARG1, PTR(&g_state.gte_completion_tick));
    721     m_gte_done_cycle = 0;
    722     m_dirty_gte_done_cycle = true;
    723   }
    724 
    725   if (flags & FLUSH_CYCLES && m_cycles > 0)
    726   {
    727     armAsm->ldr(RARG1, PTR(&g_state.pending_ticks));
    728     armAsm->add(RARG1, RARG1, armCheckAddSubConstant(m_cycles));
    729     armAsm->str(RARG1, PTR(&g_state.pending_ticks));
    730     m_gte_done_cycle = std::max<TickCount>(m_gte_done_cycle - m_cycles, 0);
    731     m_cycles = 0;
    732   }
    733 }
    734 
    735 void CPU::NewRec::AArch32Compiler::Compile_Fallback()
    736 {
    737   WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", iinfo->pc, inst->bits);
    738 
    739   Flush(FLUSH_FOR_INTERPRETER);
    740 
    741   EmitCall(reinterpret_cast<const void*>(&CPU::Recompiler::Thunks::InterpretInstruction));
    742 
    743   // TODO: make me less garbage
    744   // TODO: this is wrong, it flushes the load delay on the same cycle when we return.
    745   // but nothing should be going through here..
    746   Label no_load_delay;
    747   armAsm->ldrb(RARG1, PTR(&g_state.next_load_delay_reg));
    748   armAsm->cmp(RARG1, static_cast<u8>(Reg::count));
    749   armAsm->b(eq, &no_load_delay);
    750   armAsm->ldr(RARG2, PTR(&g_state.next_load_delay_value));
    751   armAsm->strb(RARG1, PTR(&g_state.load_delay_reg));
    752   armAsm->str(RARG2, PTR(&g_state.load_delay_value));
    753   EmitMov(RARG1, static_cast<u32>(Reg::count));
    754   armAsm->strb(RARG1, PTR(&g_state.next_load_delay_reg));
    755   armAsm->bind(&no_load_delay);
    756 
    757   m_load_delay_dirty = EMULATE_LOAD_DELAYS;
    758 }
    759 
    760 void CPU::NewRec::AArch32Compiler::CheckBranchTarget(const vixl::aarch32::Register& pcreg)
    761 {
    762   if (!g_settings.cpu_recompiler_memory_exceptions)
    763     return;
    764 
    765   armAsm->tst(pcreg, armCheckLogicalConstant(0x3));
    766   SwitchToFarCode(true, ne);
    767 
    768   BackupHostState();
    769   EndBlockWithException(Exception::AdEL);
    770 
    771   RestoreHostState();
    772   SwitchToNearCode(false);
    773 }
    774 
    775 void CPU::NewRec::AArch32Compiler::Compile_jr(CompileFlags cf)
    776 {
    777   const Register pcreg = CFGetRegS(cf);
    778   CheckBranchTarget(pcreg);
    779 
    780   armAsm->str(pcreg, PTR(&g_state.pc));
    781 
    782   CompileBranchDelaySlot(false);
    783   EndBlock(std::nullopt, true);
    784 }
    785 
    786 void CPU::NewRec::AArch32Compiler::Compile_jalr(CompileFlags cf)
    787 {
    788   const Register pcreg = CFGetRegS(cf);
    789   if (MipsD() != Reg::zero)
    790     SetConstantReg(MipsD(), GetBranchReturnAddress(cf));
    791 
    792   CheckBranchTarget(pcreg);
    793   armAsm->str(pcreg, PTR(&g_state.pc));
    794 
    795   CompileBranchDelaySlot(false);
    796   EndBlock(std::nullopt, true);
    797 }
    798 
    799 void CPU::NewRec::AArch32Compiler::Compile_bxx(CompileFlags cf, BranchCondition cond)
    800 {
    801   AssertRegOrConstS(cf);
    802 
    803   const u32 taken_pc = GetConditionalBranchTarget(cf);
    804 
    805   Flush(FLUSH_FOR_BRANCH);
    806 
    807   DebugAssert(cf.valid_host_s);
    808 
    809   // MipsT() here should equal zero for zero branches.
    810   DebugAssert(cond == BranchCondition::Equal || cond == BranchCondition::NotEqual || cf.MipsT() == Reg::zero);
    811 
    812   Label taken;
    813   const Register rs = CFGetRegS(cf);
    814   switch (cond)
    815   {
    816     case BranchCondition::Equal:
    817     case BranchCondition::NotEqual:
    818     {
    819       AssertRegOrConstT(cf);
    820       if (cf.valid_host_t)
    821         armAsm->cmp(rs, CFGetRegT(cf));
    822       else if (cf.const_t)
    823         armAsm->cmp(rs, armCheckCompareConstant(GetConstantRegU32(cf.MipsT())));
    824 
    825       armAsm->b((cond == BranchCondition::Equal) ? eq : ne, &taken);
    826     }
    827     break;
    828 
    829     case BranchCondition::GreaterThanZero:
    830     {
    831       armAsm->cmp(rs, 0);
    832       armAsm->b(gt, &taken);
    833     }
    834     break;
    835 
    836     case BranchCondition::GreaterEqualZero:
    837     {
    838       armAsm->cmp(rs, 0);
    839       armAsm->b(ge, &taken);
    840     }
    841     break;
    842 
    843     case BranchCondition::LessThanZero:
    844     {
    845       armAsm->cmp(rs, 0);
    846       armAsm->b(lt, &taken);
    847     }
    848     break;
    849 
    850     case BranchCondition::LessEqualZero:
    851     {
    852       armAsm->cmp(rs, 0);
    853       armAsm->b(le, &taken);
    854     }
    855     break;
    856   }
    857 
    858   BackupHostState();
    859   if (!cf.delay_slot_swapped)
    860     CompileBranchDelaySlot();
    861 
    862   EndBlock(m_compiler_pc, true);
    863 
    864   armAsm->bind(&taken);
    865 
    866   RestoreHostState();
    867   if (!cf.delay_slot_swapped)
    868     CompileBranchDelaySlot();
    869 
    870   EndBlock(taken_pc, true);
    871 }
    872 
    873 void CPU::NewRec::AArch32Compiler::Compile_addi(CompileFlags cf, bool overflow)
    874 {
    875   const Register rs = CFGetRegS(cf);
    876   const Register rt = CFGetRegT(cf);
    877   if (const u32 imm = inst->i.imm_sext32(); imm != 0)
    878   {
    879     if (!overflow)
    880     {
    881       armAsm->add(rt, rs, armCheckAddSubConstant(imm));
    882     }
    883     else
    884     {
    885       armAsm->adds(rt, rs, armCheckAddSubConstant(imm));
    886       TestOverflow(rt);
    887     }
    888   }
    889   else if (rt.GetCode() != rs.GetCode())
    890   {
    891     armAsm->mov(rt, rs);
    892   }
    893 }
    894 
    895 void CPU::NewRec::AArch32Compiler::Compile_addi(CompileFlags cf)
    896 {
    897   Compile_addi(cf, g_settings.cpu_recompiler_memory_exceptions);
    898 }
    899 
    900 void CPU::NewRec::AArch32Compiler::Compile_addiu(CompileFlags cf)
    901 {
    902   Compile_addi(cf, false);
    903 }
    904 
    905 void CPU::NewRec::AArch32Compiler::Compile_slti(CompileFlags cf)
    906 {
    907   Compile_slti(cf, true);
    908 }
    909 
    910 void CPU::NewRec::AArch32Compiler::Compile_sltiu(CompileFlags cf)
    911 {
    912   Compile_slti(cf, false);
    913 }
    914 
    915 void CPU::NewRec::AArch32Compiler::Compile_slti(CompileFlags cf, bool sign)
    916 {
    917   const Register rs = CFGetRegS(cf);
    918   const Register rt = CFGetRegT(cf);
    919   armAsm->cmp(rs, armCheckCompareConstant(static_cast<s32>(inst->i.imm_sext32())));
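          // Materialize the boolean with two predicated moves: rt = (rs < imm) ? 1 : 0, using
          // signed (lt/ge) or unsigned (lo/hs) conditions.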
    920   armAsm->mov(sign ? ge : hs, rt, 0);
    921   armAsm->mov(sign ? lt : lo, rt, 1);
    922 }
    923 
    924 void CPU::NewRec::AArch32Compiler::Compile_andi(CompileFlags cf)
    925 {
    926   const Register rt = CFGetRegT(cf);
    927   if (const u32 imm = inst->i.imm_zext32(); imm != 0)
    928     armAsm->and_(rt, CFGetRegS(cf), armCheckLogicalConstant(imm));
    929   else
    930     EmitMov(rt, 0);
    931 }
    932 
    933 void CPU::NewRec::AArch32Compiler::Compile_ori(CompileFlags cf)
    934 {
    935   const Register rt = CFGetRegT(cf);
    936   const Register rs = CFGetRegS(cf);
    937   if (const u32 imm = inst->i.imm_zext32(); imm != 0)
    938     armAsm->orr(rt, rs, armCheckLogicalConstant(imm));
    939   else if (rt.GetCode() != rs.GetCode())
    940     armAsm->mov(rt, rs);
    941 }
    942 
    943 void CPU::NewRec::AArch32Compiler::Compile_xori(CompileFlags cf)
    944 {
    945   const Register rt = CFGetRegT(cf);
    946   const Register rs = CFGetRegS(cf);
    947   if (const u32 imm = inst->i.imm_zext32(); imm != 0)
    948     armAsm->eor(rt, rs, armCheckLogicalConstant(imm));
    949   else if (rt.GetCode() != rs.GetCode())
    950     armAsm->mov(rt, rs);
    951 }
    952 
    953 void CPU::NewRec::AArch32Compiler::Compile_shift(CompileFlags cf,
    954                                                  void (vixl::aarch32::Assembler::*op)(vixl::aarch32::Register,
    955                                                                                       vixl::aarch32::Register,
    956                                                                                       const Operand&))
    957 {
    958   const Register rd = CFGetRegD(cf);
    959   const Register rt = CFGetRegT(cf);
    960   if (inst->r.shamt > 0)
    961     (armAsm->*op)(rd, rt, inst->r.shamt.GetValue());
    962   else if (rd.GetCode() != rt.GetCode())
    963     armAsm->mov(rd, rt);
    964 }
    965 
    966 void CPU::NewRec::AArch32Compiler::Compile_sll(CompileFlags cf)
    967 {
    968   Compile_shift(cf, &Assembler::lsl);
    969 }
    970 
    971 void CPU::NewRec::AArch32Compiler::Compile_srl(CompileFlags cf)
    972 {
    973   Compile_shift(cf, &Assembler::lsr);
    974 }
    975 
    976 void CPU::NewRec::AArch32Compiler::Compile_sra(CompileFlags cf)
    977 {
    978   Compile_shift(cf, &Assembler::asr);
    979 }
    980 
    981 void CPU::NewRec::AArch32Compiler::Compile_variable_shift(CompileFlags cf,
    982                                                           void (vixl::aarch32::Assembler::*op)(vixl::aarch32::Register,
    983                                                                                                vixl::aarch32::Register,
    984                                                                                                const Operand&))
    985 {
    986   const Register rd = CFGetRegD(cf);
    987 
    988   AssertRegOrConstS(cf);
    989   AssertRegOrConstT(cf);
    990 
    991   const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2;
    992   if (!cf.valid_host_t)
    993     MoveTToReg(rt, cf);
    994 
    995   if (cf.const_s)
    996   {
    997     if (const u32 shift = GetConstantRegU32(cf.MipsS()); shift != 0)
    998       (armAsm->*op)(rd, rt, shift);
    999     else if (rd.GetCode() != rt.GetCode())
   1000       armAsm->mov(rd, rt);
   1001   }
   1002   else
   1003   {
   1004     (armAsm->*op)(rd, rt, CFGetRegS(cf));
   1005   }
   1006 }
   1007 
   1008 void CPU::NewRec::AArch32Compiler::Compile_sllv(CompileFlags cf)
   1009 {
   1010   Compile_variable_shift(cf, &Assembler::lsl);
   1011 }
   1012 
   1013 void CPU::NewRec::AArch32Compiler::Compile_srlv(CompileFlags cf)
   1014 {
   1015   Compile_variable_shift(cf, &Assembler::lsr);
   1016 }
   1017 
   1018 void CPU::NewRec::AArch32Compiler::Compile_srav(CompileFlags cf)
   1019 {
   1020   Compile_variable_shift(cf, &Assembler::asr);
   1021 }
   1022 
   1023 void CPU::NewRec::AArch32Compiler::Compile_mult(CompileFlags cf, bool sign)
   1024 {
   1025   const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RARG1;
   1026   if (!cf.valid_host_s)
   1027     MoveSToReg(rs, cf);
   1028 
   1029   const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2;
   1030   if (!cf.valid_host_t)
   1031     MoveTToReg(rt, cf);
   1032 
   1033   // TODO: if lo/hi gets killed, we can use a 32-bit multiply
   1034   const Register lo = CFGetRegLO(cf);
   1035   const Register hi = CFGetRegHI(cf);
   1036 
   1037   (sign) ? armAsm->smull(lo, hi, rs, rt) : armAsm->umull(lo, hi, rs, rt);
   1038 }
   1039 
   1040 void CPU::NewRec::AArch32Compiler::Compile_mult(CompileFlags cf)
   1041 {
   1042   Compile_mult(cf, true);
   1043 }
   1044 
   1045 void CPU::NewRec::AArch32Compiler::Compile_multu(CompileFlags cf)
   1046 {
   1047   Compile_mult(cf, false);
   1048 }
   1049 
   1050 void CPU::NewRec::AArch32Compiler::Compile_div(CompileFlags cf)
   1051 {
   1052   const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RARG1;
   1053   if (!cf.valid_host_s)
   1054     MoveSToReg(rs, cf);
   1055 
   1056   const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2;
   1057   if (!cf.valid_host_t)
   1058     MoveTToReg(rt, cf);
   1059 
   1060   const Register rlo = CFGetRegLO(cf);
   1061   const Register rhi = CFGetRegHI(cf);
   1062 
   1063   // TODO: This could be slightly more optimal
   1064   Label done;
   1065   Label not_divide_by_zero;
   1066   armAsm->cmp(rt, 0);
   1067   armAsm->b(ne, &not_divide_by_zero);
   1068   armAsm->mov(rhi, rs); // hi = num
   1069   EmitMov(rlo, 1);
   1070   EmitMov(RSCRATCH, static_cast<u32>(-1));
   1071   armAsm->cmp(rs, 0);
   1072   armAsm->mov(ge, rlo, RSCRATCH); // lo = s >= 0 ? -1 : 1
   1073   armAsm->b(&done);
   1074 
   1075   armAsm->bind(&not_divide_by_zero);
   1076   Label not_unrepresentable;
   1077   armAsm->cmp(rs, armCheckCompareConstant(static_cast<s32>(0x80000000u)));
   1078   armAsm->b(ne, &not_unrepresentable);
   1079   armAsm->cmp(rt, armCheckCompareConstant(-1));
   1080   armAsm->b(ne, &not_unrepresentable);
   1081 
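          // INT_MIN / -1 overflows; the R3000A result is defined as lo = 0x80000000, hi = 0.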
   1082   EmitMov(rlo, 0x80000000u);
   1083   EmitMov(rhi, 0);
   1084   armAsm->b(&done);
   1085 
   1086   armAsm->bind(&not_unrepresentable);
   1087 
   1088   armAsm->sdiv(rlo, rs, rt);
   1089 
   1090   // TODO: skip when hi is dead
   1091   armAsm->mls(rhi, rlo, rt, rs);
   1092 
   1093   armAsm->bind(&done);
   1094 }
   1095 
   1096 void CPU::NewRec::AArch32Compiler::Compile_divu(CompileFlags cf)
   1097 {
   1098   const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RARG1;
   1099   if (!cf.valid_host_s)
   1100     MoveSToReg(rs, cf);
   1101 
   1102   const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2;
   1103   if (!cf.valid_host_t)
   1104     MoveTToReg(rt, cf);
   1105 
   1106   const Register rlo = CFGetRegLO(cf);
   1107   const Register rhi = CFGetRegHI(cf);
   1108 
   1109   Label done;
   1110   Label not_divide_by_zero;
   1111   armAsm->cmp(rt, 0);
   1112   armAsm->b(ne, &not_divide_by_zero);
   1113   EmitMov(rlo, static_cast<u32>(-1));
   1114   armAsm->mov(rhi, rs);
   1115   armAsm->b(&done);
   1116 
   1117   armAsm->bind(&not_divide_by_zero);
   1118 
   1119   armAsm->udiv(rlo, rs, rt);
   1120 
   1121   // TODO: skip when hi is dead
   1122   armAsm->mls(rhi, rlo, rt, rs);
   1123 
   1124   armAsm->bind(&done);
   1125 }
   1126 
   1127 void CPU::NewRec::AArch32Compiler::TestOverflow(const vixl::aarch32::Register& result)
   1128 {
   1129   SwitchToFarCode(true, vs);
   1130 
   1131   BackupHostState();
   1132 
   1133   // toss the result
   1134   ClearHostReg(result.GetCode());
   1135 
   1136   EndBlockWithException(Exception::Ov);
   1137 
   1138   RestoreHostState();
   1139 
   1140   SwitchToNearCode(false);
   1141 }
   1142 
   1143 void CPU::NewRec::AArch32Compiler::Compile_dst_op(CompileFlags cf,
   1144                                                   void (vixl::aarch32::Assembler::*op)(vixl::aarch32::Register,
   1145                                                                                        vixl::aarch32::Register,
   1146                                                                                        const Operand&),
   1147                                                   bool commutative, bool logical, bool overflow)
   1148 {
   1149   AssertRegOrConstS(cf);
   1150   AssertRegOrConstT(cf);
   1151 
   1152   const Register rd = CFGetRegD(cf);
   1153   if (cf.valid_host_s && cf.valid_host_t)
   1154   {
   1155     (armAsm->*op)(rd, CFGetRegS(cf), CFGetRegT(cf));
   1156   }
   1157   else if (commutative && (cf.const_s || cf.const_t))
   1158   {
   1159     const Register src = cf.const_s ? CFGetRegT(cf) : CFGetRegS(cf);
   1160     if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0)
   1161     {
   1162       (armAsm->*op)(rd, src, logical ? armCheckLogicalConstant(cv) : armCheckAddSubConstant(cv));
   1163     }
   1164     else
   1165     {
   1166       if (rd.GetCode() != src.GetCode())
   1167         armAsm->mov(rd, src);
   1168       overflow = false;
   1169     }
   1170   }
   1171   else if (cf.const_s)
   1172   {
   1173     EmitMov(RSCRATCH, GetConstantRegU32(cf.MipsS()));
   1174     (armAsm->*op)(rd, RSCRATCH, CFGetRegT(cf));
   1175   }
   1176   else if (cf.const_t)
   1177   {
   1178     const Register rs = CFGetRegS(cf);
   1179     if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0)
   1180     {
   1181       (armAsm->*op)(rd, rs, logical ? armCheckLogicalConstant(cv) : armCheckAddSubConstant(cv));
   1182     }
   1183     else
   1184     {
   1185       if (rd.GetCode() != rs.GetCode())
   1186         armAsm->mov(rd, rs);
   1187       overflow = false;
   1188     }
   1189   }
   1190 
   1191   if (overflow)
   1192     TestOverflow(rd);
   1193 }
   1194 
   1195 void CPU::NewRec::AArch32Compiler::Compile_add(CompileFlags cf)
   1196 {
   1197   if (g_settings.cpu_recompiler_memory_exceptions)
   1198     Compile_dst_op(cf, &Assembler::adds, true, false, true);
   1199   else
   1200     Compile_dst_op(cf, &Assembler::add, true, false, false);
   1201 }
   1202 
   1203 void CPU::NewRec::AArch32Compiler::Compile_addu(CompileFlags cf)
   1204 {
   1205   Compile_dst_op(cf, &Assembler::add, true, false, false);
   1206 }
   1207 
   1208 void CPU::NewRec::AArch32Compiler::Compile_sub(CompileFlags cf)
   1209 {
   1210   if (g_settings.cpu_recompiler_memory_exceptions)
   1211     Compile_dst_op(cf, &Assembler::subs, false, false, true);
   1212   else
   1213     Compile_dst_op(cf, &Assembler::sub, false, false, false);
   1214 }
   1215 
   1216 void CPU::NewRec::AArch32Compiler::Compile_subu(CompileFlags cf)
   1217 {
   1218   Compile_dst_op(cf, &Assembler::sub, false, false, false);
   1219 }
   1220 
   1221 void CPU::NewRec::AArch32Compiler::Compile_and(CompileFlags cf)
   1222 {
   1223   AssertRegOrConstS(cf);
   1224   AssertRegOrConstT(cf);
   1225 
   1226   // special cases - and with self -> self, and with 0 -> 0
   1227   const Register regd = CFGetRegD(cf);
   1228   if (cf.MipsS() == cf.MipsT())
   1229   {
   1230     armAsm->mov(regd, CFGetRegS(cf));
   1231     return;
   1232   }
   1233   else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))
   1234   {
   1235     EmitMov(regd, 0);
   1236     return;
   1237   }
   1238 
   1239   Compile_dst_op(cf, &Assembler::and_, true, true, false);
   1240 }
   1241 
   1242 void CPU::NewRec::AArch32Compiler::Compile_or(CompileFlags cf)
   1243 {
   1244   AssertRegOrConstS(cf);
   1245   AssertRegOrConstT(cf);
   1246 
   1247   // or/nor with 0 -> no effect
   1248   const Register regd = CFGetRegD(cf);
   1249   if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0) || cf.MipsS() == cf.MipsT())
   1250   {
   1251     cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);
   1252     return;
   1253   }
   1254 
   1255   Compile_dst_op(cf, &Assembler::orr, true, true, false);
   1256 }
   1257 
   1258 void CPU::NewRec::AArch32Compiler::Compile_xor(CompileFlags cf)
   1259 {
   1260   AssertRegOrConstS(cf);
   1261   AssertRegOrConstT(cf);
   1262 
   1263   const Register regd = CFGetRegD(cf);
   1264   if (cf.MipsS() == cf.MipsT())
   1265   {
   1266     // xor with self -> zero
   1267     EmitMov(regd, 0);
   1268     return;
   1269   }
   1270   else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))
   1271   {
   1272     // xor with zero -> no effect
   1273     cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);
   1274     return;
   1275   }
   1276 
   1277   Compile_dst_op(cf, &Assembler::eor, true, true, false);
   1278 }
   1279 
   1280 void CPU::NewRec::AArch32Compiler::Compile_nor(CompileFlags cf)
   1281 {
   1282   Compile_or(cf);
   1283   armAsm->mvn(CFGetRegD(cf), CFGetRegD(cf));
   1284 }
   1285 
   1286 void CPU::NewRec::AArch32Compiler::Compile_slt(CompileFlags cf)
   1287 {
   1288   Compile_slt(cf, true);
   1289 }
   1290 
   1291 void CPU::NewRec::AArch32Compiler::Compile_sltu(CompileFlags cf)
   1292 {
   1293   Compile_slt(cf, false);
   1294 }
   1295 
   1296 void CPU::NewRec::AArch32Compiler::Compile_slt(CompileFlags cf, bool sign)
   1297 {
   1298   AssertRegOrConstS(cf);
   1299   AssertRegOrConstT(cf);
   1300 
   1301   // TODO: swap and reverse op for constants
   1302   if (cf.const_s)
   1303   {
   1304     EmitMov(RSCRATCH, GetConstantRegS32(cf.MipsS()));
   1305     armAsm->cmp(RSCRATCH, CFGetRegT(cf));
   1306   }
   1307   else if (cf.const_t)
   1308   {
   1309     armAsm->cmp(CFGetRegS(cf), armCheckCompareConstant(GetConstantRegS32(cf.MipsT())));
   1310   }
   1311   else
   1312   {
   1313     armAsm->cmp(CFGetRegS(cf), CFGetRegT(cf));
   1314   }
   1315 
   1316   const Register rd = CFGetRegD(cf);
   1317   armAsm->mov(sign ? ge : cs, rd, 0);
   1318   armAsm->mov(sign ? lt : lo, rd, 1);
   1319 }
   1320 
   1321 vixl::aarch32::Register
   1322 CPU::NewRec::AArch32Compiler::ComputeLoadStoreAddressArg(CompileFlags cf,
   1323                                                          const std::optional<VirtualMemoryAddress>& address,
   1324                                                          const std::optional<const vixl::aarch32::Register>& reg)
   1325 {
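          // Produces a register holding rs + imm16 (sign-extended), reusing the host register
          // for rs directly when the immediate is zero and no explicit destination was given.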
   1326   const u32 imm = inst->i.imm_sext32();
   1327   if (cf.valid_host_s && imm == 0 && !reg.has_value())
   1328     return CFGetRegS(cf);
   1329 
   1330   const Register dst = reg.has_value() ? reg.value() : RARG1;
   1331   if (address.has_value())
   1332   {
   1333     EmitMov(dst, address.value());
   1334   }
   1335   else if (imm == 0)
   1336   {
   1337     if (cf.valid_host_s)
   1338     {
   1339       if (const Register src = CFGetRegS(cf); src.GetCode() != dst.GetCode())
   1340         armAsm->mov(dst, CFGetRegS(cf));
   1341     }
   1342     else
   1343     {
   1344       armAsm->ldr(dst, MipsPtr(cf.MipsS()));
   1345     }
   1346   }
   1347   else
   1348   {
   1349     if (cf.valid_host_s)
   1350     {
   1351       armAsm->add(dst, CFGetRegS(cf), armCheckAddSubConstant(static_cast<s32>(inst->i.imm_sext32())));
   1352     }
   1353     else
   1354     {
   1355       armAsm->ldr(dst, MipsPtr(cf.MipsS()));
   1356       armAsm->add(dst, dst, armCheckAddSubConstant(static_cast<s32>(inst->i.imm_sext32())));
   1357     }
   1358   }
   1359 
   1360   return dst;
   1361 }
   1362 
   1363 template<typename RegAllocFn>
   1364 vixl::aarch32::Register CPU::NewRec::AArch32Compiler::GenerateLoad(const vixl::aarch32::Register& addr_reg,
   1365                                                                    MemoryAccessSize size, bool sign, bool use_fastmem,
   1366                                                                    const RegAllocFn& dst_reg_alloc)
   1367 {
   1368   if (use_fastmem)
   1369   {
   1370     DebugAssert(g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT);
   1371     m_cycles += Bus::RAM_READ_TICKS;
   1372 
   1373     const Register dst = dst_reg_alloc();
   1374     const Register membase = GetMembaseReg();
   1375     DebugAssert(addr_reg.GetCode() != RARG3.GetCode());
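            // LUT fastmem: shift the guest address down to a page index, load that page's host
            // base pointer from the LUT, then access host memory at base + guest address.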
   1376     armAsm->lsr(RARG3, addr_reg, Bus::FASTMEM_LUT_PAGE_SHIFT);
   1377     armAsm->ldr(RARG3, MemOperand(membase, RARG3, LSL, 2));
   1378 
   1379     const MemOperand mem = MemOperand(RARG3, addr_reg);
   1380     u8* start = armAsm->GetCursorAddress<u8*>();
   1381     switch (size)
   1382     {
   1383       case MemoryAccessSize::Byte:
   1384         sign ? armAsm->ldrsb(dst, mem) : armAsm->ldrb(dst, mem);
   1385         break;
   1386 
   1387       case MemoryAccessSize::HalfWord:
   1388         sign ? armAsm->ldrsh(dst, mem) : armAsm->ldrh(dst, mem);
   1389         break;
   1390 
   1391       case MemoryAccessSize::Word:
   1392         armAsm->ldr(dst, mem);
   1393         break;
   1394     }
   1395 
   1396     AddLoadStoreInfo(start, kA32InstructionSizeInBytes, addr_reg.GetCode(), dst.GetCode(), size, sign, true);
   1397     return dst;
   1398   }
   1399 
   1400   if (addr_reg.GetCode() != RARG1.GetCode())
   1401     armAsm->mov(RARG1, addr_reg);
   1402 
   1403   const bool checked = g_settings.cpu_recompiler_memory_exceptions;
   1404   switch (size)
   1405   {
   1406     case MemoryAccessSize::Byte:
   1407     {
   1408       EmitCall(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::ReadMemoryByte) :
   1409                          reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryByte));
   1410     }
   1411     break;
   1412     case MemoryAccessSize::HalfWord:
   1413     {
   1414       EmitCall(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::ReadMemoryHalfWord) :
   1415                          reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryHalfWord));
   1416     }
   1417     break;
   1418     case MemoryAccessSize::Word:
   1419     {
   1420       EmitCall(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::ReadMemoryWord) :
   1421                          reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryWord));
   1422     }
   1423     break;
   1424   }
   1425 
   1426   // TODO: turn this into an asm function instead
   1427   if (checked)
   1428   {
   1429     SwitchToFarCodeIfBitSet(RRETHI, 31);
   1430     BackupHostState();
   1431 
   1432     // Need to stash this in a temp because of the flush.
   1433     const Register temp = Register(AllocateTempHostReg(HR_CALLEE_SAVED));
   1434     armAsm->rsb(temp, RRETHI, 0);
   1435     armAsm->lsl(temp, temp, 2);
   1436 
   1437     Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);
   1438 
   1439     // cause_bits = (-result << 2) | BD | cop_n
   1440     armAsm->orr(RARG1, temp,
   1441                 armCheckLogicalConstant(Cop0Registers::CAUSE::MakeValueForException(
   1442                   static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)));
   1443     EmitMov(RARG2, m_current_instruction_pc);
   1444     EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
   1445     FreeHostReg(temp.GetCode());
   1446     EndBlock(std::nullopt, true);
   1447 
   1448     RestoreHostState();
   1449     SwitchToNearCode(false);
   1450   }
   1451 
   1452   const Register dst_reg = dst_reg_alloc();
   1453   switch (size)
   1454   {
   1455     case MemoryAccessSize::Byte:
   1456     {
   1457       sign ? armAsm->sxtb(dst_reg, RRET) : armAsm->uxtb(dst_reg, RRET);
   1458     }
   1459     break;
   1460     case MemoryAccessSize::HalfWord:
   1461     {
   1462       sign ? armAsm->sxth(dst_reg, RRET) : armAsm->uxth(dst_reg, RRET);
   1463     }
   1464     break;
   1465     case MemoryAccessSize::Word:
   1466     {
   1467       if (dst_reg.GetCode() != RRET.GetCode())
   1468         armAsm->mov(dst_reg, RRET);
   1469     }
   1470     break;
   1471   }
   1472 
   1473   return dst_reg;
   1474 }
   1475 
   1476 void CPU::NewRec::AArch32Compiler::GenerateStore(const vixl::aarch32::Register& addr_reg,
   1477                                                  const vixl::aarch32::Register& value_reg, MemoryAccessSize size,
   1478                                                  bool use_fastmem)
   1479 {
   1480   if (use_fastmem)
   1481   {
   1482     DebugAssert(g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT);
   1483     DebugAssert(addr_reg.GetCode() != RARG3.GetCode());
   1484     const Register membase = GetMembaseReg();
   1485     armAsm->lsr(RARG3, addr_reg, Bus::FASTMEM_LUT_PAGE_SHIFT);
   1486     armAsm->ldr(RARG3, MemOperand(membase, RARG3, LSL, 2));
   1487 
   1488     const MemOperand mem = MemOperand(RARG3, addr_reg);
   1489     u8* start = armAsm->GetCursorAddress<u8*>();
   1490     switch (size)
   1491     {
   1492       case MemoryAccessSize::Byte:
   1493         armAsm->strb(value_reg, mem);
   1494         break;
   1495 
   1496       case MemoryAccessSize::HalfWord:
   1497         armAsm->strh(value_reg, mem);
   1498         break;
   1499 
   1500       case MemoryAccessSize::Word:
   1501         armAsm->str(value_reg, mem);
   1502         break;
   1503     }
   1504     AddLoadStoreInfo(start, kA32InstructionSizeInBytes, addr_reg.GetCode(), value_reg.GetCode(), size, false, false);
   1505     return;
   1506   }
   1507 
   1508   if (addr_reg.GetCode() != RARG1.GetCode())
   1509     armAsm->mov(RARG1, addr_reg);
   1510   if (value_reg.GetCode() != RARG2.GetCode())
   1511     armAsm->mov(RARG2, value_reg);
   1512 
   1513   const bool checked = g_settings.cpu_recompiler_memory_exceptions;
   1514   switch (size)
   1515   {
   1516     case MemoryAccessSize::Byte:
   1517     {
   1518       EmitCall(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::WriteMemoryByte) :
   1519                          reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryByte));
   1520     }
   1521     break;
   1522     case MemoryAccessSize::HalfWord:
   1523     {
   1524       EmitCall(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::WriteMemoryHalfWord) :
   1525                          reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryHalfWord));
   1526     }
   1527     break;
   1528     case MemoryAccessSize::Word:
   1529     {
   1530       EmitCall(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::WriteMemoryWord) :
   1531                          reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryWord));
   1532     }
   1533     break;
   1534   }
   1535 
   1536   // TODO: turn this into an asm function instead
   1537   if (checked)
   1538   {
   1539     SwitchToFarCodeIfRegZeroOrNonZero(RRET, true);
   1540     BackupHostState();
   1541 
   1542     // Need to stash this in a temp because of the flush.
   1543     const Register temp = Register(AllocateTempHostReg(HR_CALLEE_SAVED));
   1544     armAsm->lsl(temp, RRET, 2);
   1545 
   1546     Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);
   1547 
   1548     // cause_bits = (result << 2) | BD | cop_n
   1549     armAsm->orr(RARG1, temp,
   1550                 armCheckLogicalConstant(Cop0Registers::CAUSE::MakeValueForException(
   1551                   static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)));
   1552     EmitMov(RARG2, m_current_instruction_pc);
   1553     EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
   1554     FreeHostReg(temp.GetCode());
   1555     EndBlock(std::nullopt, true);
   1556 
   1557     RestoreHostState();
   1558     SwitchToNearCode(false);
   1559   }
   1560 }
   1561 
   1562 void CPU::NewRec::AArch32Compiler::Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
   1563                                                const std::optional<VirtualMemoryAddress>& address)
   1564 {
   1565   const std::optional<Register> addr_reg = g_settings.gpu_pgxp_enable ?
   1566                                              std::optional<Register>(Register(AllocateTempHostReg(HR_CALLEE_SAVED))) :
   1567                                              std::optional<Register>();
   1568   FlushForLoadStore(address, false, use_fastmem);
   1569   const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
   1570   const Register data = GenerateLoad(addr, size, sign, use_fastmem, [this, cf]() {
   1571     if (cf.MipsT() == Reg::zero)
   1572       return RRET;
   1573 
   1574     return Register(AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
   1575                                     EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, cf.MipsT()));
   1576   });
   1577 
   1578   if (g_settings.gpu_pgxp_enable)
   1579   {
   1580     Flush(FLUSH_FOR_C_CALL);
   1581 
   1582     EmitMov(RARG1, inst->bits);
   1583     armAsm->mov(RARG2, addr);
   1584     armAsm->mov(RARG3, data);
   1585     EmitCall(s_pgxp_mem_load_functions[static_cast<u32>(size)][static_cast<u32>(sign)]);
   1586     FreeHostReg(addr_reg.value().GetCode());
   1587   }
   1588 }
   1589 
   1590 void CPU::NewRec::AArch32Compiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
   1591                                                const std::optional<VirtualMemoryAddress>& address)
   1592 {
   1593   DebugAssert(size == MemoryAccessSize::Word && !sign);
   1594 
   1595   const Register addr = Register(AllocateTempHostReg(HR_CALLEE_SAVED));
   1596   FlushForLoadStore(address, false, use_fastmem);
   1597 
   1598   // TODO: if address is constant, this can be simplified..
   1599 
   1600   // If we're coming from another block, just flush the load delay and hope for the best..
   1601   if (m_load_delay_dirty)
   1602     UpdateLoadDelay();
   1603 
   1604   // We'd need to be careful here if we weren't overwriting it..
   1605   ComputeLoadStoreAddressArg(cf, address, addr);
   1606   armAsm->bic(RARG1, addr, 3);
   1607   GenerateLoad(RARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RRET; });
   1608 
   1609   if (inst->r.rt == Reg::zero)
   1610   {
   1611     FreeHostReg(addr.GetCode());
   1612     return;
   1613   }
   1614 
    1615   // lwl/lwr from a load-delayed value takes the new value, but is itself load-delayed, so the original value
    1616   // is never written back. NOTE: can't trust T in cf because of the flush.
   1617   const Reg rt = inst->r.rt;
   1618   Register value;
   1619   if (m_load_delay_register == rt)
   1620   {
   1621     const u32 existing_ld_rt = (m_load_delay_value_register == NUM_HOST_REGS) ?
   1622                                  AllocateHostReg(HR_MODE_READ, HR_TYPE_LOAD_DELAY_VALUE, rt) :
   1623                                  m_load_delay_value_register;
   1624     RenameHostReg(existing_ld_rt, HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt);
   1625     value = Register(existing_ld_rt);
   1626   }
   1627   else
   1628   {
   1629     if constexpr (EMULATE_LOAD_DELAYS)
   1630     {
   1631       value = Register(AllocateHostReg(HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt));
   1632       if (const std::optional<u32> rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value())
   1633         armAsm->mov(value, Register(rtreg.value()));
   1634       else if (HasConstantReg(rt))
   1635         EmitMov(value, GetConstantRegU32(rt));
   1636       else
   1637         armAsm->ldr(value, MipsPtr(rt));
   1638     }
   1639     else
   1640     {
   1641       value = Register(AllocateHostReg(HR_MODE_READ | HR_MODE_WRITE, HR_TYPE_CPU_REG, rt));
   1642     }
   1643   }
   1644 
   1645   DebugAssert(value.GetCode() != RARG2.GetCode() && value.GetCode() != RARG3.GetCode());
   1646   armAsm->and_(RARG2, addr, 3);
   1647   armAsm->lsl(RARG2, RARG2, 3); // *8
   1648   EmitMov(RARG3, 24);
   1649   armAsm->sub(RARG3, RARG3, RARG2);
   1650 
   1651   if (inst->op == InstructionOp::lwl)
   1652   {
   1653     // const u32 mask = UINT32_C(0x00FFFFFF) >> shift;
    1654     // new_value = (value & mask) | (RRET << (24 - shift));
   1655     EmitMov(RSCRATCH, 0xFFFFFFu);
   1656     armAsm->lsr(RSCRATCH, RSCRATCH, RARG2);
   1657     armAsm->and_(value, value, RSCRATCH);
   1658     armAsm->lsl(RRET, RRET, RARG3);
   1659     armAsm->orr(value, value, RRET);
   1660   }
   1661   else
   1662   {
   1663     // const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift);
    1664     // new_value = (value & mask) | (RRET >> shift);
   1665     armAsm->lsr(RRET, RRET, RARG2);
   1666     EmitMov(RSCRATCH, 0xFFFFFF00u);
   1667     armAsm->lsl(RSCRATCH, RSCRATCH, RARG3);
   1668     armAsm->and_(value, value, RSCRATCH);
   1669     armAsm->orr(value, value, RRET);
   1670   }
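           // Worked example of the merges above (same arithmetic as the comments, little-endian):
           // with addr & 3 == 1 (shift = 8):
           //   lwl: rt = (rt & 0x0000FFFF) | (RRET << 16)
           //   lwr: rt = (rt & 0xFF000000) | (RRET >> 8)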
   1671 
   1672   FreeHostReg(addr.GetCode());
   1673 
   1674   if (g_settings.gpu_pgxp_enable)
   1675   {
   1676     Flush(FLUSH_FOR_C_CALL);
   1677     armAsm->mov(RARG3, value);
   1678     armAsm->bic(RARG2, addr, 3);
   1679     EmitMov(RARG1, inst->bits);
   1680     EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LW));
   1681   }
   1682 }
   1683 
   1684 void CPU::NewRec::AArch32Compiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
   1685                                                 const std::optional<VirtualMemoryAddress>& address)
   1686 {
   1687   const u32 index = static_cast<u32>(inst->r.rt.GetValue());
   1688   const auto [ptr, action] = GetGTERegisterPointer(index, true);
   1689   const std::optional<Register> addr_reg = g_settings.gpu_pgxp_enable ?
   1690                                              std::optional<Register>(Register(AllocateTempHostReg(HR_CALLEE_SAVED))) :
   1691                                              std::optional<Register>();
   1692   FlushForLoadStore(address, false, use_fastmem);
   1693   const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
   1694   const Register value = GenerateLoad(addr, MemoryAccessSize::Word, false, use_fastmem, [this, action = action]() {
   1695     return (action == GTERegisterAccessAction::CallHandler && g_settings.gpu_pgxp_enable) ?
   1696              Register(AllocateTempHostReg(HR_CALLEE_SAVED)) :
   1697              RRET;
   1698   });
   1699 
   1700   switch (action)
   1701   {
   1702     case GTERegisterAccessAction::Ignore:
   1703     {
   1704       break;
   1705     }
   1706 
   1707     case GTERegisterAccessAction::Direct:
   1708     {
   1709       armAsm->str(value, PTR(ptr));
   1710       break;
   1711     }
   1712 
   1713     case GTERegisterAccessAction::SignExtend16:
   1714     {
   1715       armAsm->sxth(RARG3, value);
   1716       armAsm->str(RARG3, PTR(ptr));
   1717       break;
   1718     }
   1719 
   1720     case GTERegisterAccessAction::ZeroExtend16:
   1721     {
   1722       armAsm->uxth(RARG3, value);
   1723       armAsm->str(RARG3, PTR(ptr));
   1724       break;
   1725     }
   1726 
   1727     case GTERegisterAccessAction::CallHandler:
   1728     {
   1729       Flush(FLUSH_FOR_C_CALL);
   1730       armAsm->mov(RARG2, value);
   1731       EmitMov(RARG1, index);
   1732       EmitCall(reinterpret_cast<const void*>(&GTE::WriteRegister));
   1733       break;
   1734     }
   1735 
   1736     case GTERegisterAccessAction::PushFIFO:
   1737     {
   1738       // SXY0 <- SXY1
   1739       // SXY1 <- SXY2
   1740       // SXY2 <- SXYP
   1741       DebugAssert(value.GetCode() != RARG2.GetCode() && value.GetCode() != RARG3.GetCode());
   1742       armAsm->ldr(RARG2, PTR(&g_state.gte_regs.SXY1[0]));
   1743       armAsm->ldr(RARG3, PTR(&g_state.gte_regs.SXY2[0]));
   1744       armAsm->str(RARG2, PTR(&g_state.gte_regs.SXY0[0]));
   1745       armAsm->str(RARG3, PTR(&g_state.gte_regs.SXY1[0]));
   1746       armAsm->str(value, PTR(&g_state.gte_regs.SXY2[0]));
   1747       break;
   1748     }
   1749 
   1750     default:
   1751     {
   1752       Panic("Unknown action");
   1753       return;
   1754     }
   1755   }
   1756 
   1757   if (g_settings.gpu_pgxp_enable)
   1758   {
   1759     Flush(FLUSH_FOR_C_CALL);
   1760     armAsm->mov(RARG3, value);
   1761     if (value.GetCode() != RRET.GetCode())
   1762       FreeHostReg(value.GetCode());
   1763     armAsm->mov(RARG2, addr);
   1764     FreeHostReg(addr_reg.value().GetCode());
   1765     EmitMov(RARG1, inst->bits);
   1766     EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LWC2));
   1767   }
   1768 }
   1769 
   1770 void CPU::NewRec::AArch32Compiler::Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
   1771                                                const std::optional<VirtualMemoryAddress>& address)
   1772 {
   1773   AssertRegOrConstS(cf);
   1774   AssertRegOrConstT(cf);
   1775 
   1776   const std::optional<Register> addr_reg = g_settings.gpu_pgxp_enable ?
   1777                                              std::optional<Register>(Register(AllocateTempHostReg(HR_CALLEE_SAVED))) :
   1778                                              std::optional<Register>();
   1779   FlushForLoadStore(address, true, use_fastmem);
   1780   const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
   1781   const Register data = cf.valid_host_t ? CFGetRegT(cf) : RARG2;
   1782   if (!cf.valid_host_t)
   1783     MoveTToReg(RARG2, cf);
   1784 
   1785   GenerateStore(addr, data, size, use_fastmem);
   1786 
   1787   if (g_settings.gpu_pgxp_enable)
   1788   {
   1789     Flush(FLUSH_FOR_C_CALL);
   1790     MoveMIPSRegToReg(RARG3, cf.MipsT());
   1791     armAsm->mov(RARG2, addr);
   1792     EmitMov(RARG1, inst->bits);
   1793     EmitCall(s_pgxp_mem_store_functions[static_cast<u32>(size)]);
   1794     FreeHostReg(addr_reg.value().GetCode());
   1795   }
   1796 }
   1797 
   1798 void CPU::NewRec::AArch32Compiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
   1799                                                const std::optional<VirtualMemoryAddress>& address)
   1800 {
   1801   DebugAssert(size == MemoryAccessSize::Word && !sign);
   1802 
   1803   // TODO: this can take over rt's value if it's no longer needed
   1804   // NOTE: can't trust T in cf because of the alloc
   1805   const Register addr = Register(AllocateTempHostReg(HR_CALLEE_SAVED));
   1806   const Register value = g_settings.gpu_pgxp_enable ? Register(AllocateTempHostReg(HR_CALLEE_SAVED)) : RARG2;
   1807   if (g_settings.gpu_pgxp_enable)
   1808     MoveMIPSRegToReg(value, inst->r.rt);
   1809 
   1810   FlushForLoadStore(address, true, use_fastmem);
   1811 
   1812   // TODO: if address is constant, this can be simplified..
   1813   // We'd need to be careful here if we weren't overwriting it..
   1814   ComputeLoadStoreAddressArg(cf, address, addr);
   1815   armAsm->bic(RARG1, addr, 3);
   1816   GenerateLoad(RARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RRET; });
   1817 
   1818   armAsm->and_(RSCRATCH, addr, 3);
   1819   armAsm->lsl(RSCRATCH, RSCRATCH, 3); // *8
   1820   armAsm->bic(addr, addr, 3);
   1821 
    1822   // Need to do this load down here when PGXP is off, because the value sits in a volatile reg that the flush can overwrite.
   1823   if (!g_settings.gpu_pgxp_enable)
   1824     MoveMIPSRegToReg(value, inst->r.rt);
   1825 
   1826   if (inst->op == InstructionOp::swl)
   1827   {
   1828     // const u32 mem_mask = UINT32_C(0xFFFFFF00) << shift;
    1829     // new_value = (RRET & mem_mask) | (value >> (24 - shift));
   1830     EmitMov(RARG3, 0xFFFFFF00u);
   1831     armAsm->lsl(RARG3, RARG3, RSCRATCH);
   1832     armAsm->and_(RRET, RRET, RARG3);
   1833 
   1834     EmitMov(RARG3, 24);
   1835     armAsm->sub(RARG3, RARG3, RSCRATCH);
   1836     armAsm->lsr(value, value, RARG3);
   1837     armAsm->orr(value, value, RRET);
   1838   }
   1839   else
   1840   {
   1841     // const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift);
    1842     // new_value = (RRET & mem_mask) | (value << shift);
   1843     armAsm->lsl(value, value, RSCRATCH);
   1844 
   1845     EmitMov(RARG3, 24);
   1846     armAsm->sub(RARG3, RARG3, RSCRATCH);
   1847     EmitMov(RSCRATCH, 0x00FFFFFFu);
   1848     armAsm->lsr(RSCRATCH, RSCRATCH, RARG3);
   1849     armAsm->and_(RRET, RRET, RSCRATCH);
   1850     armAsm->orr(value, value, RRET);
   1851   }
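           // Worked example of the store merges above, with addr & 3 == 1 (shift = 8):
           //   swl: stored = (RRET & 0xFFFF0000) | (rt >> 16)
           //   swr: stored = (RRET & 0x000000FF) | (rt << 8)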
   1852 
   1853   if (!g_settings.gpu_pgxp_enable)
   1854   {
   1855     GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
   1856     FreeHostReg(addr.GetCode());
   1857   }
   1858   else
   1859   {
   1860     GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
   1861 
   1862     Flush(FLUSH_FOR_C_CALL);
   1863     armAsm->mov(RARG3, value);
   1864     FreeHostReg(value.GetCode());
   1865     armAsm->mov(RARG2, addr);
   1866     FreeHostReg(addr.GetCode());
   1867     EmitMov(RARG1, inst->bits);
   1868     EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SW));
   1869   }
   1870 }
   1871 
   1872 void CPU::NewRec::AArch32Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
   1873                                                 const std::optional<VirtualMemoryAddress>& address)
   1874 {
   1875   const u32 index = static_cast<u32>(inst->r.rt.GetValue());
   1876   const auto [ptr, action] = GetGTERegisterPointer(index, false);
   1877   const Register addr = (g_settings.gpu_pgxp_enable || action == GTERegisterAccessAction::CallHandler) ?
   1878                           Register(AllocateTempHostReg(HR_CALLEE_SAVED)) :
   1879                           RARG1;
   1880   const Register data = g_settings.gpu_pgxp_enable ? Register(AllocateTempHostReg(HR_CALLEE_SAVED)) : RARG2;
   1881   FlushForLoadStore(address, true, use_fastmem);
   1882   ComputeLoadStoreAddressArg(cf, address, addr);
   1883 
   1884   switch (action)
   1885   {
   1886     case GTERegisterAccessAction::Direct:
   1887     {
   1888       armAsm->ldr(data, PTR(ptr));
   1889     }
   1890     break;
   1891 
   1892     case GTERegisterAccessAction::CallHandler:
   1893     {
    1894       // Should already be flushed, except in the fastmem case.
   1895       Flush(FLUSH_FOR_C_CALL);
   1896       EmitMov(RARG1, index);
   1897       EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));
   1898       armAsm->mov(data, RRET);
   1899     }
   1900     break;
   1901 
   1902     default:
   1903     {
   1904       Panic("Unknown action");
   1905     }
   1906     break;
   1907   }
   1908 
   1909   GenerateStore(addr, data, size, use_fastmem);
   1910   if (!g_settings.gpu_pgxp_enable)
   1911   {
   1912     if (addr.GetCode() != RARG1.GetCode())
   1913       FreeHostReg(addr.GetCode());
   1914   }
   1915   else
   1916   {
   1917     // TODO: This can be simplified because we don't need to validate in PGXP..
   1918     Flush(FLUSH_FOR_C_CALL);
   1919     armAsm->mov(RARG3, data);
   1920     FreeHostReg(data.GetCode());
   1921     armAsm->mov(RARG2, addr);
   1922     FreeHostReg(addr.GetCode());
   1923     EmitMov(RARG1, inst->bits);
   1924     EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWC2));
   1925   }
   1926 }
   1927 
   1928 void CPU::NewRec::AArch32Compiler::Compile_mtc0(CompileFlags cf)
   1929 {
   1930   // TODO: we need better constant setting here.. which will need backprop
   1931   AssertRegOrConstT(cf);
   1932 
   1933   const Cop0Reg reg = static_cast<Cop0Reg>(MipsD());
   1934   const u32* ptr = GetCop0RegPtr(reg);
   1935   const u32 mask = GetCop0RegWriteMask(reg);
   1936   if (!ptr)
   1937   {
   1938     Compile_Fallback();
   1939     return;
   1940   }
   1941 
   1942   if (mask == 0)
   1943   {
   1944     // if it's a read-only register, ignore
   1945     DEBUG_LOG("Ignoring write to read-only cop0 reg {}", static_cast<u32>(reg));
   1946     return;
   1947   }
   1948 
   1949   // for some registers, we need to test certain bits
   1950   const bool needs_bit_test = (reg == Cop0Reg::SR);
   1951   const Register new_value = RARG1;
   1952   const Register old_value = RARG2;
   1953   const Register changed_bits = RARG3;
   1954   const Register mask_reg = RSCRATCH;
   1955 
   1956   // Load old value
   1957   armAsm->ldr(old_value, PTR(ptr));
   1958 
   1959   // No way we fit this in an immediate..
   1960   EmitMov(mask_reg, mask);
   1961 
   1962   // update value
   1963   if (cf.valid_host_t)
   1964     armAsm->and_(new_value, CFGetRegT(cf), mask_reg);
   1965   else
   1966     EmitMov(new_value, GetConstantRegU32(cf.MipsT()) & mask);
   1967 
   1968   if (needs_bit_test)
   1969     armAsm->eor(changed_bits, old_value, new_value);
   1970   armAsm->bic(old_value, old_value, mask_reg);
   1971   armAsm->orr(new_value, old_value, new_value);
   1972   armAsm->str(new_value, PTR(ptr));
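           // In effect: changed_bits = old ^ (t & mask); *ptr = (old & ~mask) | (t & mask). Only bit 16 of
           // changed_bits (SR.Isc, cache isolation) is tested below, since toggling it remaps memory.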
   1973 
   1974   if (reg == Cop0Reg::SR)
   1975   {
   1976     // TODO: replace with register backup
   1977     // We could just inline the whole thing..
   1978     Flush(FLUSH_FOR_C_CALL);
   1979 
   1980     SwitchToFarCodeIfBitSet(changed_bits, 16);
   1981     armAsm->push(RegisterList(RARG1));
   1982     EmitCall(reinterpret_cast<const void*>(&CPU::UpdateMemoryPointers));
   1983     armAsm->pop(RegisterList(RARG1));
   1984     if (CodeCache::IsUsingFastmem() && m_block->HasFlag(CodeCache::BlockFlags::ContainsLoadStoreInstructions) &&
   1985         IsHostRegAllocated(RMEMBASE.GetCode()))
   1986     {
   1987       FreeHostReg(RMEMBASE.GetCode());
   1988     }
   1989     SwitchToNearCode(true);
   1990 
   1991     TestInterrupts(RARG1);
   1992   }
   1993   else if (reg == Cop0Reg::CAUSE)
   1994   {
   1995     armAsm->ldr(RARG1, PTR(&g_state.cop0_regs.sr.bits));
   1996     TestInterrupts(RARG1);
   1997   }
   1998 
   1999   if (reg == Cop0Reg::DCIC && g_settings.cpu_recompiler_memory_exceptions)
   2000   {
   2001     // TODO: DCIC handling for debug breakpoints
   2002     WARNING_LOG("TODO: DCIC handling for debug breakpoints");
   2003   }
   2004 }
   2005 
   2006 void CPU::NewRec::AArch32Compiler::Compile_rfe(CompileFlags cf)
   2007 {
   2008   // shift mode bits right two, preserving upper bits
   2009   armAsm->ldr(RARG1, PTR(&g_state.cop0_regs.sr.bits));
   2010   armAsm->bic(RARG2, RARG1, 15);
   2011   armAsm->ubfx(RARG1, RARG1, 2, 4);
   2012   armAsm->orr(RARG1, RARG1, RARG2);
   2013   armAsm->str(RARG1, PTR(&g_state.cop0_regs.sr.bits));
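           // i.e. sr = (sr & ~0xFu) | ((sr >> 2) & 0xFu): pops the KU/IE mode stack (current <- previous, previous <- old).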
   2014 
   2015   TestInterrupts(RARG1);
   2016 }
   2017 
   2018 void CPU::NewRec::AArch32Compiler::TestInterrupts(const vixl::aarch32::Register& sr)
   2019 {
   2020   // if Iec == 0 then goto no_interrupt
   2021   Label no_interrupt;
   2022   armAsm->tst(sr, 1);
   2023   armAsm->b(eq, &no_interrupt);
   2024 
   2025   // sr & cause
   2026   armAsm->ldr(RSCRATCH, PTR(&g_state.cop0_regs.cause.bits));
   2027   armAsm->and_(sr, sr, RSCRATCH);
   2028 
   2029   // ((sr & cause) & 0xff00) == 0 goto no_interrupt
   2030   armAsm->tst(sr, 0xFF00);
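           // Together with the Iec test above, the far path is taken when (sr & 1) && ((sr & cause) & 0xFF00),
           // i.e. interrupts are enabled and at least one unmasked IP bit is pending.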
   2031 
   2032   SwitchToFarCode(true, ne);
   2033   BackupHostState();
   2034 
    2035   // Update the load delay; this normally happens at the end of an instruction, but we're finishing it early.
   2036   UpdateLoadDelay();
   2037 
   2038   Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);
   2039 
   2040   // Can't use EndBlockWithException() here, because it'll use the wrong PC.
   2041   // Can't use RaiseException() on the fast path if we're the last instruction, because the next PC is unknown.
   2042   if (!iinfo->is_last_instruction)
   2043   {
   2044     EmitMov(RARG1, Cop0Registers::CAUSE::MakeValueForException(Exception::INT, iinfo->is_branch_instruction, false,
   2045                                                                (inst + 1)->cop.cop_n));
   2046     EmitMov(RARG2, m_compiler_pc);
   2047     EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
   2048     m_dirty_pc = false;
   2049     EndAndLinkBlock(std::nullopt, true, false);
   2050   }
   2051   else
   2052   {
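             // Last instruction in the block: the next PC is unknown, so instead of raising the exception inline,
             // zero the downcount (and store the PC if dirty) and let the dispatcher service the interrupt.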
   2053     EmitMov(RARG1, 0);
   2054     if (m_dirty_pc)
   2055       EmitMov(RARG2, m_compiler_pc);
   2056     armAsm->str(RARG1, PTR(&g_state.downcount));
   2057     if (m_dirty_pc)
   2058       armAsm->str(RARG2, PTR(&g_state.pc));
   2059     m_dirty_pc = false;
   2060     EndAndLinkBlock(std::nullopt, false, true);
   2061   }
   2062 
   2063   RestoreHostState();
   2064   SwitchToNearCode(false);
   2065 
   2066   armAsm->bind(&no_interrupt);
   2067 }
   2068 
   2069 void CPU::NewRec::AArch32Compiler::Compile_mfc2(CompileFlags cf)
   2070 {
   2071   const u32 index = inst->cop.Cop2Index();
   2072   const Reg rt = inst->r.rt;
   2073 
   2074   const auto [ptr, action] = GetGTERegisterPointer(index, false);
   2075   if (action == GTERegisterAccessAction::Ignore)
   2076     return;
   2077 
   2078   u32 hreg;
   2079   if (action == GTERegisterAccessAction::Direct)
   2080   {
   2081     hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
   2082                            EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
   2083     armAsm->ldr(Register(hreg), PTR(ptr));
   2084   }
   2085   else if (action == GTERegisterAccessAction::CallHandler)
   2086   {
   2087     Flush(FLUSH_FOR_C_CALL);
   2088     EmitMov(RARG1, index);
   2089     EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));
   2090 
   2091     hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
   2092                            EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
   2093     armAsm->mov(Register(hreg), RRET);
   2094   }
   2095   else
   2096   {
   2097     Panic("Unknown action");
   2098     return;
   2099   }
   2100 
   2101   if (g_settings.gpu_pgxp_enable)
   2102   {
   2103     Flush(FLUSH_FOR_C_CALL);
   2104     EmitMov(RARG1, inst->bits);
   2105     armAsm->mov(RARG2, Register(hreg));
   2106     EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_MFC2));
   2107   }
   2108 }
   2109 
   2110 void CPU::NewRec::AArch32Compiler::Compile_mtc2(CompileFlags cf)
   2111 {
   2112   const u32 index = inst->cop.Cop2Index();
   2113   const auto [ptr, action] = GetGTERegisterPointer(index, true);
   2114   if (action == GTERegisterAccessAction::Ignore)
   2115     return;
   2116 
   2117   if (action == GTERegisterAccessAction::Direct)
   2118   {
   2119     if (cf.const_t)
   2120       StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), ptr);
   2121     else
   2122       armAsm->str(CFGetRegT(cf), PTR(ptr));
   2123   }
   2124   else if (action == GTERegisterAccessAction::SignExtend16 || action == GTERegisterAccessAction::ZeroExtend16)
   2125   {
   2126     const bool sign = (action == GTERegisterAccessAction::SignExtend16);
   2127     if (cf.valid_host_t)
   2128     {
   2129       sign ? armAsm->sxth(RARG1, CFGetRegT(cf)) : armAsm->uxth(RARG1, CFGetRegT(cf));
   2130       armAsm->str(RARG1, PTR(ptr));
   2131     }
   2132     else if (cf.const_t)
   2133     {
   2134       const u16 cv = Truncate16(GetConstantRegU32(cf.MipsT()));
   2135       StoreConstantToCPUPointer(sign ? ::SignExtend32(cv) : ::ZeroExtend32(cv), ptr);
   2136     }
   2137     else
   2138     {
   2139       Panic("Unsupported setup");
   2140     }
   2141   }
   2142   else if (action == GTERegisterAccessAction::CallHandler)
   2143   {
   2144     Flush(FLUSH_FOR_C_CALL);
   2145     EmitMov(RARG1, index);
   2146     MoveTToReg(RARG2, cf);
   2147     EmitCall(reinterpret_cast<const void*>(&GTE::WriteRegister));
   2148   }
   2149   else if (action == GTERegisterAccessAction::PushFIFO)
   2150   {
   2151     // SXY0 <- SXY1
   2152     // SXY1 <- SXY2
   2153     // SXY2 <- SXYP
   2154     DebugAssert(RRET.GetCode() != RARG2.GetCode() && RRET.GetCode() != RARG3.GetCode());
   2155     armAsm->ldr(RARG2, PTR(&g_state.gte_regs.SXY1[0]));
   2156     armAsm->ldr(RARG3, PTR(&g_state.gte_regs.SXY2[0]));
   2157     armAsm->str(RARG2, PTR(&g_state.gte_regs.SXY0[0]));
   2158     armAsm->str(RARG3, PTR(&g_state.gte_regs.SXY1[0]));
   2159     if (cf.valid_host_t)
   2160       armAsm->str(CFGetRegT(cf), PTR(&g_state.gte_regs.SXY2[0]));
   2161     else if (cf.const_t)
   2162       StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), &g_state.gte_regs.SXY2[0]);
   2163     else
   2164       Panic("Unsupported setup");
   2165   }
   2166   else
   2167   {
   2168     Panic("Unknown action");
   2169   }
   2170 }
   2171 
   2172 void CPU::NewRec::AArch32Compiler::Compile_cop2(CompileFlags cf)
   2173 {
   2174   TickCount func_ticks;
   2175   GTE::InstructionImpl func = GTE::GetInstructionImpl(inst->bits, &func_ticks);
   2176 
   2177   Flush(FLUSH_FOR_C_CALL);
   2178   EmitMov(RARG1, inst->bits & GTE::Instruction::REQUIRED_BITS_MASK);
   2179   EmitCall(reinterpret_cast<const void*>(func));
   2180 
   2181   AddGTETicks(func_ticks);
   2182 }
   2183 
   2184 u32 CPU::NewRec::CompileLoadStoreThunk(void* thunk_code, u32 thunk_space, void* code_address, u32 code_size,
   2185                                        TickCount cycles_to_add, TickCount cycles_to_remove, u32 gpr_bitmask,
   2186                                        u8 address_register, u8 data_register, MemoryAccessSize size, bool is_signed,
   2187                                        bool is_load)
   2188 {
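           // A sketch of what this builds (assumption: it's the slow-path thunk used when a fastmem access
           // recorded via AddLoadStoreInfo() faults and gets backpatched): save caller-saved registers, adjust
           // pending_ticks, call the unchecked memory thunk, move the result into the destination register,
           // then jump back to the code following the patched instruction.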
   2189   Assembler arm_asm(static_cast<u8*>(thunk_code), thunk_space);
   2190   Assembler* armAsm = &arm_asm;
   2191 
   2192 #ifdef VIXL_DEBUG
   2193   vixl::CodeBufferCheckScope asm_check(armAsm, thunk_space, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
   2194 #endif
   2195 
   2196   // save regs
   2197   RegisterList save_regs;
   2198 
   2199   for (u32 i = 0; i < NUM_HOST_REGS; i++)
   2200   {
   2201     if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i))
   2202       save_regs.Combine(RegisterList(Register(i)));
   2203   }
   2204 
   2205   if (!save_regs.IsEmpty())
   2206     armAsm->push(save_regs);
   2207 
   2208   if (address_register != static_cast<u8>(RARG1.GetCode()))
   2209     armAsm->mov(RARG1, Register(address_register));
   2210 
   2211   if (!is_load)
   2212   {
   2213     if (data_register != static_cast<u8>(RARG2.GetCode()))
   2214       armAsm->mov(RARG2, Register(data_register));
   2215   }
   2216 
   2217   if (cycles_to_add != 0)
   2218   {
   2219     // NOTE: we have to reload here, because memory writes can run DMA, which can screw with cycles
   2220     armAsm->ldr(RARG3, PTR(&g_state.pending_ticks));
   2221     if (!ImmediateA32::IsImmediateA32(cycles_to_add))
   2222     {
   2223       armEmitMov(armAsm, RSCRATCH, cycles_to_add);
   2224       armAsm->add(RARG3, RARG3, RSCRATCH);
   2225     }
   2226     else
   2227     {
   2228       armAsm->add(RARG3, RARG3, cycles_to_add);
   2229     }
   2230 
   2231     armAsm->str(RARG3, PTR(&g_state.pending_ticks));
   2232   }
   2233 
   2234   switch (size)
   2235   {
   2236     case MemoryAccessSize::Byte:
   2237     {
   2238       armEmitCall(armAsm,
   2239                   is_load ? reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryByte) :
   2240                             reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryByte),
   2241                   false);
   2242     }
   2243     break;
   2244     case MemoryAccessSize::HalfWord:
   2245     {
   2246       armEmitCall(armAsm,
   2247                   is_load ? reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryHalfWord) :
   2248                             reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryHalfWord),
   2249                   false);
   2250     }
   2251     break;
   2252     case MemoryAccessSize::Word:
   2253     {
   2254       armEmitCall(armAsm,
   2255                   is_load ? reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryWord) :
   2256                             reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryWord),
   2257                   false);
   2258     }
   2259     break;
   2260   }
   2261 
   2262   if (is_load)
   2263   {
   2264     const Register dst = Register(data_register);
   2265     switch (size)
   2266     {
   2267       case MemoryAccessSize::Byte:
   2268       {
   2269         is_signed ? armAsm->sxtb(dst, RRET) : armAsm->uxtb(dst, RRET);
   2270       }
   2271       break;
   2272       case MemoryAccessSize::HalfWord:
   2273       {
   2274         is_signed ? armAsm->sxth(dst, RRET) : armAsm->uxth(dst, RRET);
   2275       }
   2276       break;
   2277       case MemoryAccessSize::Word:
   2278       {
   2279         if (dst.GetCode() != RRET.GetCode())
   2280           armAsm->mov(dst, RRET);
   2281       }
   2282       break;
   2283     }
   2284   }
   2285 
   2286   if (cycles_to_remove != 0)
   2287   {
   2288     armAsm->ldr(RARG3, PTR(&g_state.pending_ticks));
   2289     if (!ImmediateA32::IsImmediateA32(cycles_to_remove))
   2290     {
   2291       armEmitMov(armAsm, RSCRATCH, cycles_to_remove);
   2292       armAsm->sub(RARG3, RARG3, RSCRATCH);
   2293     }
   2294     else
   2295     {
   2296       armAsm->sub(RARG3, RARG3, cycles_to_remove);
   2297     }
   2298     armAsm->str(RARG3, PTR(&g_state.pending_ticks));
   2299   }
   2300 
   2301   // restore regs
   2302   if (!save_regs.IsEmpty())
   2303     armAsm->pop(save_regs);
   2304 
   2305   armEmitJmp(armAsm, static_cast<const u8*>(code_address) + code_size, true);
   2306   armAsm->FinalizeCode();
   2307 
   2308   return static_cast<u32>(armAsm->GetCursorOffset());
   2309 }
   2310 
   2311 #endif // CPU_ARCH_ARM32