duckstation

duckstation, archived from the revision just before upstream relicensed it as a proprietary software project; this version is the libre one
git clone https://git.neptards.moe/u3shit/duckstation.git

cpu_newrec_compiler_x64.cpp (65933B)


      1 // SPDX-FileCopyrightText: 2023 Connor McLaughlin <stenzek@gmail.com>
      2 // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
      3 
      4 #include "cpu_newrec_compiler_x64.h"
      5 #include "common/align.h"
      6 #include "common/assert.h"
      7 #include "common/log.h"
      8 #include "common/string_util.h"
      9 #include "cpu_code_cache_private.h"
     10 #include "cpu_core_private.h"
     11 #include "cpu_pgxp.h"
     12 #include "cpu_recompiler_thunks.h"
     13 #include "cpu_recompiler_types.h"
     14 #include "gte.h"
     15 #include "settings.h"
     16 #include "timing_event.h"
     17 #include <limits>
     18 
     19 #ifdef CPU_ARCH_X64
     20 
     21 Log_SetChannel(CPU::NewRec);
     22 
     23 #define RMEMBASE cg->rbx
     24 #define RSTATE cg->rbp
     25 
     26 // #define PTR(x) (cg->rip + (x))
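         // PTR(x) rewrites a pointer into g_state as an offset from RSTATE (rbp), so state accesses don't need rip-relative addressing.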
     27 #define PTR(x) (RSTATE + (u32)(((u8*)(x)) - ((u8*)&g_state)))
     28 
     29 // PGXP TODO: LWL etc, MFC0
     30 // PGXP TODO: Spyro 1 level gates have issues.
     31 
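         // Fastmem accesses are padded to at least this many bytes so a faulting access can be backpatched with a jmp to its slow path.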
     32 static constexpr u32 BACKPATCH_JMP_SIZE = 5;
     33 
      34 // on win32, we need to reserve an additional 32 bytes of shadow space when calling out to C
     35 #ifdef _WIN32
     36 static constexpr u32 STACK_SHADOW_SIZE = 32;
     37 #else
     38 static constexpr u32 STACK_SHADOW_SIZE = 0;
     39 #endif
     40 
     41 using namespace Xbyak;
     42 
     43 using CPU::Recompiler::IsCallerSavedRegister;
     44 
     45 // TODO: try using a pointer to state instead of rip-relative.. it might end up faster due to smaller code
     46 
     47 namespace CPU::NewRec {
     48 X64Compiler s_instance;
     49 Compiler* g_compiler = &s_instance;
     50 } // namespace CPU::NewRec
     51 
     52 CPU::NewRec::X64Compiler::X64Compiler() = default;
     53 
     54 CPU::NewRec::X64Compiler::~X64Compiler() = default;
     55 
     56 void CPU::NewRec::X64Compiler::Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space,
     57                                      u8* far_code_buffer, u32 far_code_space)
     58 {
     59   Compiler::Reset(block, code_buffer, code_buffer_space, far_code_buffer, far_code_space);
     60 
     61   // TODO: don't recreate this every time..
     62   DebugAssert(!m_emitter && !m_far_emitter && !cg);
     63   m_emitter = std::make_unique<Xbyak::CodeGenerator>(code_buffer_space, code_buffer);
     64   m_far_emitter = std::make_unique<Xbyak::CodeGenerator>(far_code_space, far_code_buffer);
     65   cg = m_emitter.get();
     66 
     67   // Need to wipe it out so it's correct when toggling fastmem.
     68   m_host_regs = {};
     69 
     70   const u32 membase_idx = CodeCache::IsUsingFastmem() ? static_cast<u32>(RMEMBASE.getIdx()) : NUM_HOST_REGS;
     71   const u32 cpu_idx = static_cast<u32>(RSTATE.getIdx());
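           // Everything else becomes allocatable; the return/argument registers, rsp, the state pointer, the fastmem base and ecx stay reserved.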
     72   for (u32 i = 0; i < NUM_HOST_REGS; i++)
     73   {
     74     HostRegAlloc& ra = m_host_regs[i];
     75 
     76     if (i == static_cast<u32>(RWRET.getIdx()) || i == static_cast<u32>(RWARG1.getIdx()) ||
     77         i == static_cast<u32>(RWARG2.getIdx()) || i == static_cast<u32>(RWARG3.getIdx()) ||
     78         i == static_cast<u32>(cg->rsp.getIdx()) || i == cpu_idx || i == membase_idx ||
     79         i == static_cast<u32>(cg->ecx.getIdx()) /* keep ecx free for shifts, maybe use BMI? */)
     80     {
     81       continue;
     82     }
     83 
     84     ra.flags = HR_USABLE | (IsCallerSavedRegister(i) ? 0 : HR_CALLEE_SAVED);
     85   }
     86 }
     87 
     88 void CPU::NewRec::X64Compiler::SwitchToFarCode(bool emit_jump, void (Xbyak::CodeGenerator::*jump_op)(const void*))
     89 {
     90   DebugAssert(cg == m_emitter.get());
     91   if (emit_jump)
     92   {
     93     const void* fcptr = m_far_emitter->getCurr<const void*>();
     94     (jump_op) ? (cg->*jump_op)(fcptr) : cg->jmp(fcptr);
     95   }
     96   cg = m_far_emitter.get();
     97 }
     98 
     99 void CPU::NewRec::X64Compiler::SwitchToNearCode(bool emit_jump, void (Xbyak::CodeGenerator::*jump_op)(const void*))
    100 {
    101   DebugAssert(cg == m_far_emitter.get());
    102   if (emit_jump)
    103   {
    104     const void* fcptr = m_emitter->getCurr<const void*>();
    105     (jump_op) ? (cg->*jump_op)(fcptr) : cg->jmp(fcptr);
    106   }
    107   cg = m_emitter.get();
    108 }
    109 
    110 void CPU::NewRec::X64Compiler::BeginBlock()
    111 {
    112   Compiler::BeginBlock();
    113 
    114 #if 0
    115   if (m_block->pc == 0xBFC06F0C)
    116   {
    117     //__debugbreak();
    118     cg->db(0xcc);
    119   }
    120 #endif
    121 
    122 #if 0
    123   cg->nop();
    124   cg->mov(RWARG1, m_block->pc);
    125   cg->nop();
    126 #endif
    127 }
    128 
    129 void CPU::NewRec::X64Compiler::GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size)
    130 {
     131   // store the pointers in registers first to reduce code size, because we can then address both buffers with offsets
    132   cg->mov(RXARG1, static_cast<size_t>(reinterpret_cast<uintptr_t>(ram_ptr)));
    133   cg->mov(RXARG2, static_cast<size_t>(reinterpret_cast<uintptr_t>(shadow_ptr)));
    134 
    135   bool first = true;
    136   u32 offset = 0;
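           // Compare RAM against the shadow copy in 16-byte SSE chunks, ANDing the per-dword equality masks into xmm0.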
    137   while (size >= 16)
    138   {
    139     const Xbyak::Xmm& dst = first ? cg->xmm0 : cg->xmm1;
    140     cg->movups(dst, cg->xword[RXARG1 + offset]);
    141     cg->pcmpeqd(dst, cg->xword[RXARG2 + offset]);
    142     if (!first)
    143       cg->pand(cg->xmm0, dst);
    144     else
    145       first = false;
    146 
    147     offset += 16;
    148     size -= 16;
    149   }
    150 
    151   // TODO: better codegen for 16 byte aligned blocks
    152   if (!first)
    153   {
    154     cg->movmskps(cg->eax, cg->xmm0);
    155     cg->cmp(cg->eax, 0xf);
    156     cg->jne(CodeCache::g_discard_and_recompile_block);
    157   }
    158 
    159   while (size >= 8)
    160   {
    161     cg->mov(RXARG3, cg->qword[RXARG1 + offset]);
    162     cg->cmp(RXARG3, cg->qword[RXARG2 + offset]);
    163     cg->jne(CodeCache::g_discard_and_recompile_block);
    164     offset += 8;
    165     size -= 8;
    166   }
    167 
    168   while (size >= 4)
    169   {
    170     cg->mov(RWARG3, cg->dword[RXARG1 + offset]);
    171     cg->cmp(RWARG3, cg->dword[RXARG2 + offset]);
    172     cg->jne(CodeCache::g_discard_and_recompile_block);
    173     offset += 4;
    174     size -= 4;
    175   }
    176 
    177   DebugAssert(size == 0);
    178 }
    179 
    180 void CPU::NewRec::X64Compiler::GenerateICacheCheckAndUpdate()
    181 {
    182   if (!m_block->HasFlag(CodeCache::BlockFlags::IsUsingICache))
    183   {
    184     if (m_block->HasFlag(CodeCache::BlockFlags::NeedsDynamicFetchTicks))
    185     {
    186       cg->mov(cg->eax, m_block->size);
    187       cg->mul(cg->dword[cg->rip + GetFetchMemoryAccessTimePtr()]);
    188       cg->add(cg->dword[PTR(&g_state.pending_ticks)], cg->eax);
    189     }
    190     else
    191     {
    192       cg->add(cg->dword[PTR(&g_state.pending_ticks)], static_cast<u32>(m_block->uncached_fetch_ticks));
    193     }
    194   }
    195   else if (m_block->icache_line_count > 0)
    196   {
    197     cg->lea(RXARG1, cg->dword[PTR(&g_state.icache_tags)]);
    198 
    199     // TODO: Vectorize this...
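             // For each icache line the block spans: if the stored tag is stale, write the new tag and charge the fill ticks.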
    200     VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK;
    201     for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE)
    202     {
    203       const VirtualMemoryAddress tag = GetICacheTagForAddress(current_pc);
    204       const TickCount fill_ticks = GetICacheFillTicks(current_pc);
    205       if (fill_ticks <= 0)
    206         continue;
    207 
    208       const u32 line = GetICacheLine(current_pc);
    209       const u32 offset = (line * sizeof(u32));
    210       Xbyak::Label cache_hit;
    211 
    212       cg->cmp(cg->dword[RXARG1 + offset], tag);
    213       cg->je(cache_hit);
    214       cg->mov(cg->dword[RXARG1 + offset], tag);
    215       cg->add(cg->dword[PTR(&g_state.pending_ticks)], static_cast<u32>(fill_ticks));
    216       cg->L(cache_hit);
    217     }
    218   }
    219 }
    220 
    221 void CPU::NewRec::X64Compiler::GenerateCall(const void* func, s32 arg1reg /*= -1*/, s32 arg2reg /*= -1*/,
    222                                             s32 arg3reg /*= -1*/)
    223 {
    224   if (arg1reg >= 0 && arg1reg != static_cast<s32>(RXARG1.getIdx()))
    225     cg->mov(RXARG1, Reg64(arg1reg));
    226   if (arg2reg >= 0 && arg2reg != static_cast<s32>(RXARG2.getIdx()))
    227     cg->mov(RXARG2, Reg64(arg2reg));
    228   if (arg3reg >= 0 && arg3reg != static_cast<s32>(RXARG3.getIdx()))
    229     cg->mov(RXARG3, Reg64(arg3reg));
    230   cg->call(func);
    231 }
    232 
    233 void CPU::NewRec::X64Compiler::EndBlock(const std::optional<u32>& newpc, bool do_event_test)
    234 {
    235   if (newpc.has_value())
    236   {
    237     if (m_dirty_pc || m_compiler_pc != newpc)
    238       cg->mov(cg->dword[PTR(&g_state.pc)], newpc.value());
    239   }
    240   m_dirty_pc = false;
    241 
    242   // flush regs
    243   Flush(FLUSH_END_BLOCK);
    244   EndAndLinkBlock(newpc, do_event_test, false);
    245 }
    246 
    247 void CPU::NewRec::X64Compiler::EndBlockWithException(Exception excode)
    248 {
    249   // flush regs, but not pc, it's going to get overwritten
    250   // flush cycles because of the GTE instruction stuff...
    251   Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);
    252 
    253   // TODO: flush load delay
    254   // TODO: break for pcdrv
    255 
    256   cg->mov(RWARG1, Cop0Registers::CAUSE::MakeValueForException(excode, m_current_instruction_branch_delay_slot, false,
    257                                                               inst->cop.cop_n));
    258   cg->mov(RWARG2, m_current_instruction_pc);
    259   cg->call(static_cast<void (*)(u32, u32)>(&CPU::RaiseException));
    260   m_dirty_pc = false;
    261 
    262   EndAndLinkBlock(std::nullopt, true, false);
    263 }
    264 
    265 void CPU::NewRec::X64Compiler::EndAndLinkBlock(const std::optional<u32>& newpc, bool do_event_test,
    266                                                bool force_run_events)
    267 {
    268   // event test
    269   // pc should've been flushed
    270   DebugAssert(!m_dirty_pc && !m_block_ended);
    271   m_block_ended = true;
    272 
    273   // TODO: try extracting this to a function
    274 
    275   // save cycles for event test
    276   const TickCount cycles = std::exchange(m_cycles, 0);
    277 
    278   // fast path when not doing an event test
    279   if (!do_event_test && m_gte_done_cycle <= cycles)
    280   {
    281     if (cycles == 1)
    282       cg->inc(cg->dword[PTR(&g_state.pending_ticks)]);
    283     else if (cycles > 0)
    284       cg->add(cg->dword[PTR(&g_state.pending_ticks)], cycles);
    285 
    286     if (force_run_events)
    287     {
    288       cg->jmp(CodeCache::g_run_events_and_dispatch);
    289       return;
    290     }
    291   }
    292   else
    293   {
    294     // pending_ticks += cycles
    295     // if (pending_ticks >= downcount) { dispatch_event(); }
    296     if (do_event_test || cycles > 0 || m_gte_done_cycle > cycles)
    297       cg->mov(RWARG1, cg->dword[PTR(&g_state.pending_ticks)]);
    298     if (cycles > 0)
    299       cg->add(RWARG1, cycles);
    300     if (m_gte_done_cycle > cycles)
    301     {
    302       cg->mov(RWARG2, RWARG1);
    303       ((m_gte_done_cycle - cycles) == 1) ? cg->inc(RWARG2) : cg->add(RWARG2, m_gte_done_cycle - cycles);
    304       cg->mov(cg->dword[PTR(&g_state.gte_completion_tick)], RWARG2);
    305     }
    306     if (do_event_test)
    307       cg->cmp(RWARG1, cg->dword[PTR(&g_state.downcount)]);
    308     if (cycles > 0)
    309       cg->mov(cg->dword[PTR(&g_state.pending_ticks)], RWARG1);
    310     if (do_event_test)
    311       cg->jge(CodeCache::g_run_events_and_dispatch);
    312   }
    313 
    314   // jump to dispatcher or next block
    315   if (!newpc.has_value())
    316   {
    317     cg->jmp(CodeCache::g_dispatcher);
    318   }
    319   else
    320   {
    321     if (newpc.value() == m_block->pc)
    322     {
    323       // Special case: ourselves! No need to backlink then.
    324       DEBUG_LOG("Linking block at {:08X} to self", m_block->pc);
    325       cg->jmp(cg->getCode());
    326     }
    327     else
    328     {
    329       const void* target = CodeCache::CreateBlockLink(m_block, cg->getCurr<void*>(), newpc.value());
    330       cg->jmp(target, CodeGenerator::T_NEAR);
    331     }
    332   }
    333 }
    334 
    335 const void* CPU::NewRec::X64Compiler::EndCompile(u32* code_size, u32* far_code_size)
    336 {
    337   const void* code = m_emitter->getCode();
    338   *code_size = static_cast<u32>(m_emitter->getSize());
    339   *far_code_size = static_cast<u32>(m_far_emitter->getSize());
    340   cg = nullptr;
    341   m_far_emitter.reset();
    342   m_emitter.reset();
    343   return code;
    344 }
    345 
    346 const void* CPU::NewRec::X64Compiler::GetCurrentCodePointer()
    347 {
    348   return cg->getCurr();
    349 }
    350 
    351 const char* CPU::NewRec::X64Compiler::GetHostRegName(u32 reg) const
    352 {
    353   static constexpr std::array<const char*, 16> reg64_names = {
    354     {"rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"}};
    355   return (reg < reg64_names.size()) ? reg64_names[reg] : "UNKNOWN";
    356 }
    357 
    358 void CPU::NewRec::X64Compiler::LoadHostRegWithConstant(u32 reg, u32 val)
    359 {
    360   cg->mov(Reg32(reg), val);
    361 }
    362 
    363 void CPU::NewRec::X64Compiler::LoadHostRegFromCPUPointer(u32 reg, const void* ptr)
    364 {
    365   cg->mov(Reg32(reg), cg->dword[PTR(ptr)]);
    366 }
    367 
    368 void CPU::NewRec::X64Compiler::StoreHostRegToCPUPointer(u32 reg, const void* ptr)
    369 {
    370   cg->mov(cg->dword[PTR(ptr)], Reg32(reg));
    371 }
    372 
    373 void CPU::NewRec::X64Compiler::StoreConstantToCPUPointer(u32 val, const void* ptr)
    374 {
    375   cg->mov(cg->dword[PTR(ptr)], val);
    376 }
    377 
    378 void CPU::NewRec::X64Compiler::CopyHostReg(u32 dst, u32 src)
    379 {
    380   if (src != dst)
    381     cg->mov(Reg32(dst), Reg32(src));
    382 }
    383 
    384 Xbyak::Address CPU::NewRec::X64Compiler::MipsPtr(Reg r) const
    385 {
    386   DebugAssert(r < Reg::count);
    387   return cg->dword[PTR(&g_state.regs.r[static_cast<u32>(r)])];
    388 }
    389 
    390 Xbyak::Reg32 CPU::NewRec::X64Compiler::CFGetRegD(CompileFlags cf) const
    391 {
    392   DebugAssert(cf.valid_host_d);
    393   return Reg32(cf.host_d);
    394 }
    395 
    396 Xbyak::Reg32 CPU::NewRec::X64Compiler::CFGetRegS(CompileFlags cf) const
    397 {
    398   DebugAssert(cf.valid_host_s);
    399   return Reg32(cf.host_s);
    400 }
    401 
    402 Xbyak::Reg32 CPU::NewRec::X64Compiler::CFGetRegT(CompileFlags cf) const
    403 {
    404   DebugAssert(cf.valid_host_t);
    405   return Reg32(cf.host_t);
    406 }
    407 
    408 Xbyak::Reg32 CPU::NewRec::X64Compiler::CFGetRegLO(CompileFlags cf) const
    409 {
    410   DebugAssert(cf.valid_host_lo);
    411   return Reg32(cf.host_lo);
    412 }
    413 
    414 Xbyak::Reg32 CPU::NewRec::X64Compiler::CFGetRegHI(CompileFlags cf) const
    415 {
    416   DebugAssert(cf.valid_host_hi);
    417   return Reg32(cf.host_hi);
    418 }
    419 
    420 Xbyak::Reg32 CPU::NewRec::X64Compiler::MoveSToD(CompileFlags cf)
    421 {
    422   DebugAssert(cf.valid_host_d);
    423   DebugAssert(!cf.valid_host_t || cf.host_t != cf.host_d);
    424 
    425   const Reg32 rd = CFGetRegD(cf);
    426   MoveSToReg(rd, cf);
    427 
    428   return rd;
    429 }
    430 
    431 Xbyak::Reg32 CPU::NewRec::X64Compiler::MoveSToT(CompileFlags cf)
    432 {
    433   DebugAssert(cf.valid_host_t);
    434 
    435   const Reg32 rt = CFGetRegT(cf);
    436   if (cf.valid_host_s)
    437   {
    438     const Reg32 rs = CFGetRegS(cf);
    439     if (rt != rs)
    440       cg->mov(rt, rs);
    441   }
    442   else if (cf.const_s)
    443   {
    444     if (const u32 cv = GetConstantRegU32(cf.MipsS()); cv != 0)
    445       cg->mov(rt, cv);
    446     else
    447       cg->xor_(rt, rt);
    448   }
    449   else
    450   {
    451     cg->mov(rt, MipsPtr(cf.MipsS()));
    452   }
    453 
    454   return rt;
    455 }
    456 
    457 Xbyak::Reg32 CPU::NewRec::X64Compiler::MoveTToD(CompileFlags cf)
    458 {
    459   DebugAssert(cf.valid_host_d);
    460   DebugAssert(!cf.valid_host_s || cf.host_s != cf.host_d);
    461 
    462   const Reg32 rd = CFGetRegD(cf);
    463   MoveTToReg(rd, cf);
    464   return rd;
    465 }
    466 
    467 void CPU::NewRec::X64Compiler::MoveSToReg(const Xbyak::Reg32& dst, CompileFlags cf)
    468 {
    469   if (cf.valid_host_s)
    470   {
    471     if (cf.host_s != static_cast<u32>(dst.getIdx()))
    472       cg->mov(dst, Reg32(cf.host_s));
    473   }
    474   else if (cf.const_s)
    475   {
    476     const u32 cv = GetConstantRegU32(cf.MipsS());
    477     if (cv == 0)
    478       cg->xor_(dst, dst);
    479     else
    480       cg->mov(dst, cv);
    481   }
    482   else
    483   {
    484     cg->mov(dst, cg->dword[PTR(&g_state.regs.r[cf.mips_s])]);
    485   }
    486 }
    487 
    488 void CPU::NewRec::X64Compiler::MoveTToReg(const Xbyak::Reg32& dst, CompileFlags cf)
    489 {
    490   if (cf.valid_host_t)
    491   {
    492     if (cf.host_t != static_cast<u32>(dst.getIdx()))
    493       cg->mov(dst, Reg32(cf.host_t));
    494   }
    495   else if (cf.const_t)
    496   {
    497     const u32 cv = GetConstantRegU32(cf.MipsT());
    498     if (cv == 0)
    499       cg->xor_(dst, dst);
    500     else
    501       cg->mov(dst, cv);
    502   }
    503   else
    504   {
    505     cg->mov(dst, cg->dword[PTR(&g_state.regs.r[cf.mips_t])]);
    506   }
    507 }
    508 
    509 void CPU::NewRec::X64Compiler::MoveMIPSRegToReg(const Xbyak::Reg32& dst, Reg reg)
    510 {
    511   DebugAssert(reg < Reg::count);
    512   if (const std::optional<u32> hreg = CheckHostReg(0, Compiler::HR_TYPE_CPU_REG, reg))
    513     cg->mov(dst, Reg32(hreg.value()));
    514   else if (HasConstantReg(reg))
    515     cg->mov(dst, GetConstantRegU32(reg));
    516   else
    517     cg->mov(dst, MipsPtr(reg));
    518 }
    519 
    520 void CPU::NewRec::X64Compiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val,
    521                                                             Reg arg2reg /* = Reg::count */,
    522                                                             Reg arg3reg /* = Reg::count */)
    523 {
    524   DebugAssert(g_settings.gpu_pgxp_enable);
    525 
    526   Flush(FLUSH_FOR_C_CALL);
    527 
    528   if (arg2reg != Reg::count)
    529     MoveMIPSRegToReg(RWARG2, arg2reg);
    530   if (arg3reg != Reg::count)
    531     MoveMIPSRegToReg(RWARG3, arg3reg);
    532 
    533   cg->mov(RWARG1, arg1val);
    534   cg->call(func);
    535 }
    536 
    537 void CPU::NewRec::X64Compiler::Flush(u32 flags)
    538 {
    539   Compiler::Flush(flags);
    540 
    541   if (flags & FLUSH_PC && m_dirty_pc)
    542   {
    543     cg->mov(cg->dword[PTR(&g_state.pc)], m_compiler_pc);
    544     m_dirty_pc = false;
    545   }
    546 
    547   if (flags & FLUSH_INSTRUCTION_BITS)
    548   {
    549     cg->mov(cg->dword[PTR(&g_state.current_instruction.bits)], inst->bits);
    550     cg->mov(cg->dword[PTR(&g_state.current_instruction_pc)], m_current_instruction_pc);
    551     cg->mov(cg->byte[PTR(&g_state.current_instruction_in_branch_delay_slot)], m_current_instruction_branch_delay_slot);
    552   }
    553 
    554   if (flags & FLUSH_LOAD_DELAY_FROM_STATE && m_load_delay_dirty)
    555   {
    556     // This sucks :(
    557     // TODO: make it a function?
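             // Apply the pending load delay recorded in state: regs.r[load_delay_reg] = load_delay_value, then mark the slot empty.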
    558     cg->movzx(RWARG1, cg->byte[PTR(&g_state.load_delay_reg)]);
    559     cg->mov(RWARG2, cg->dword[PTR(&g_state.load_delay_value)]);
    560     cg->mov(cg->dword[PTR(&g_state.regs.r[0]) + RXARG1 * 4], RWARG2);
    561     cg->mov(cg->byte[PTR(&g_state.load_delay_reg)], static_cast<u8>(Reg::count));
    562     m_load_delay_dirty = false;
    563   }
    564 
    565   if (flags & FLUSH_LOAD_DELAY && m_load_delay_register != Reg::count)
    566   {
    567     if (m_load_delay_value_register != NUM_HOST_REGS)
    568       FreeHostReg(m_load_delay_value_register);
    569 
    570     cg->mov(cg->byte[PTR(&g_state.load_delay_reg)], static_cast<u8>(m_load_delay_register));
    571     m_load_delay_register = Reg::count;
    572     m_load_delay_dirty = true;
    573   }
    574 
    575   if (flags & FLUSH_GTE_STALL_FROM_STATE && m_dirty_gte_done_cycle)
    576   {
    577     // May as well flush cycles while we're here.
    578     // GTE spanning blocks is very rare, we _could_ disable this for speed.
    579     cg->mov(RWARG1, cg->dword[PTR(&g_state.pending_ticks)]);
    580     cg->mov(RWARG2, cg->dword[PTR(&g_state.gte_completion_tick)]);
    581     if (m_cycles > 0)
    582     {
    583       (m_cycles == 1) ? cg->inc(RWARG1) : cg->add(RWARG1, m_cycles);
    584       m_cycles = 0;
    585     }
    586     cg->cmp(RWARG2, RWARG1);
    587     cg->cmova(RWARG1, RWARG2);
    588     cg->mov(cg->dword[PTR(&g_state.pending_ticks)], RWARG1);
    589     m_dirty_gte_done_cycle = false;
    590   }
    591 
    592   if (flags & FLUSH_GTE_DONE_CYCLE && m_gte_done_cycle > m_cycles)
    593   {
    594     cg->mov(RWARG1, cg->dword[PTR(&g_state.pending_ticks)]);
    595 
    596     // update cycles at the same time
    597     if (flags & FLUSH_CYCLES && m_cycles > 0)
    598     {
    599       (m_cycles == 1) ? cg->inc(RWARG1) : cg->add(RWARG1, m_cycles);
    600       cg->mov(cg->dword[PTR(&g_state.pending_ticks)], RWARG1);
    601       m_gte_done_cycle -= m_cycles;
    602       m_cycles = 0;
    603     }
    604 
    605     (m_gte_done_cycle == 1) ? cg->inc(RWARG1) : cg->add(RWARG1, m_gte_done_cycle);
    606     cg->mov(cg->dword[PTR(&g_state.gte_completion_tick)], RWARG1);
    607     m_gte_done_cycle = 0;
    608     m_dirty_gte_done_cycle = true;
    609   }
    610 
    611   if (flags & FLUSH_CYCLES && m_cycles > 0)
    612   {
    613     (m_cycles == 1) ? cg->inc(cg->dword[PTR(&g_state.pending_ticks)]) :
    614                       cg->add(cg->dword[PTR(&g_state.pending_ticks)], m_cycles);
    615     m_gte_done_cycle = std::max<TickCount>(m_gte_done_cycle - m_cycles, 0);
    616     m_cycles = 0;
    617   }
    618 }
    619 
    620 void CPU::NewRec::X64Compiler::Compile_Fallback()
    621 {
    622   WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", iinfo->pc, inst->bits);
    623 
    624   Flush(FLUSH_FOR_INTERPRETER);
    625 
    626   cg->call(&CPU::Recompiler::Thunks::InterpretInstruction);
    627 
    628   // TODO: make me less garbage
    629   // TODO: this is wrong, it flushes the load delay on the same cycle when we return.
    630   // but nothing should be going through here..
    631   Label no_load_delay;
    632   cg->movzx(RWARG1, cg->byte[PTR(&g_state.next_load_delay_reg)]);
    633   cg->cmp(RWARG1, static_cast<u8>(Reg::count));
    634   cg->je(no_load_delay, CodeGenerator::T_SHORT);
    635   cg->mov(RWARG2, cg->dword[PTR(&g_state.next_load_delay_value)]);
    636   cg->mov(cg->byte[PTR(&g_state.load_delay_reg)], RWARG1);
    637   cg->mov(cg->dword[PTR(&g_state.load_delay_value)], RWARG2);
    638   cg->mov(cg->byte[PTR(&g_state.next_load_delay_reg)], static_cast<u32>(Reg::count));
    639   cg->L(no_load_delay);
    640 
    641   m_load_delay_dirty = EMULATE_LOAD_DELAYS;
    642 }
    643 
    644 void CPU::NewRec::X64Compiler::CheckBranchTarget(const Xbyak::Reg32& pcreg)
    645 {
    646   if (!g_settings.cpu_recompiler_memory_exceptions)
    647     return;
    648 
    649   cg->test(pcreg, 0x3);
    650   SwitchToFarCode(true, &CodeGenerator::jnz);
    651 
    652   BackupHostState();
    653   EndBlockWithException(Exception::AdEL);
    654 
    655   RestoreHostState();
    656   SwitchToNearCode(false);
    657 }
    658 
    659 void CPU::NewRec::X64Compiler::Compile_jr(CompileFlags cf)
    660 {
    661   if (!cf.valid_host_s)
    662     cg->mov(RWARG1, MipsPtr(cf.MipsS()));
    663 
    664   const Reg32 pcreg = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;
    665   CheckBranchTarget(pcreg);
    666 
    667   cg->mov(cg->dword[PTR(&g_state.pc)], pcreg);
    668 
    669   CompileBranchDelaySlot(false);
    670   EndBlock(std::nullopt, true);
    671 }
    672 
    673 void CPU::NewRec::X64Compiler::Compile_jalr(CompileFlags cf)
    674 {
    675   if (!cf.valid_host_s)
    676     cg->mov(RWARG1, MipsPtr(cf.MipsS()));
    677 
    678   const Reg32 pcreg = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;
    679 
    680   if (MipsD() != Reg::zero)
    681     SetConstantReg(MipsD(), GetBranchReturnAddress(cf));
    682 
    683   CheckBranchTarget(pcreg);
    684   cg->mov(cg->dword[PTR(&g_state.pc)], pcreg);
    685 
    686   CompileBranchDelaySlot(false);
    687   EndBlock(std::nullopt, true);
    688 }
    689 
    690 void CPU::NewRec::X64Compiler::Compile_bxx(CompileFlags cf, BranchCondition cond)
    691 {
    692   const u32 taken_pc = GetConditionalBranchTarget(cf);
    693 
    694   Flush(FLUSH_FOR_BRANCH);
    695 
    696   DebugAssert(cf.valid_host_s);
    697 
    698   // MipsT() here should equal zero for zero branches.
    699   DebugAssert(cond == BranchCondition::Equal || cond == BranchCondition::NotEqual || cf.MipsT() == Reg::zero);
    700 
    701   // TODO: Swap this back to near once instructions don't blow up
    702   constexpr CodeGenerator::LabelType type = CodeGenerator::T_NEAR;
    703   Label taken;
    704   switch (cond)
    705   {
    706     case BranchCondition::Equal:
    707     case BranchCondition::NotEqual:
    708     {
    709       // we should always have S, maybe not T
    710       // TODO: if it's zero, we can just do test rs, rs
    711       if (cf.valid_host_t)
    712         cg->cmp(CFGetRegS(cf), CFGetRegT(cf));
    713       else if (cf.const_t)
    714         cg->cmp(CFGetRegS(cf), GetConstantRegU32(cf.MipsT()));
    715       else
    716         cg->cmp(CFGetRegS(cf), MipsPtr(cf.MipsT()));
    717 
    718       (cond == BranchCondition::Equal) ? cg->je(taken, type) : cg->jne(taken, type);
    719     }
    720     break;
    721 
    722     case BranchCondition::GreaterThanZero:
    723     {
    724       cg->cmp(CFGetRegS(cf), 0);
    725       cg->jg(taken, type);
    726     }
    727     break;
    728 
    729     case BranchCondition::GreaterEqualZero:
    730     {
    731       cg->test(CFGetRegS(cf), CFGetRegS(cf));
    732       cg->jns(taken, type);
    733     }
    734     break;
    735 
    736     case BranchCondition::LessThanZero:
    737     {
    738       cg->test(CFGetRegS(cf), CFGetRegS(cf));
    739       cg->js(taken, type);
    740     }
    741     break;
    742 
    743     case BranchCondition::LessEqualZero:
    744     {
    745       cg->cmp(CFGetRegS(cf), 0);
    746       cg->jle(taken, type);
    747     }
    748     break;
    749   }
    750 
    751   BackupHostState();
    752   if (!cf.delay_slot_swapped)
    753     CompileBranchDelaySlot();
    754 
    755   EndBlock(m_compiler_pc, true);
    756 
    757   cg->L(taken);
    758 
    759   RestoreHostState();
    760   if (!cf.delay_slot_swapped)
    761     CompileBranchDelaySlot();
    762 
    763   EndBlock(taken_pc, true);
    764 }
    765 
    766 void CPU::NewRec::X64Compiler::Compile_addi(CompileFlags cf)
    767 {
    768   const Reg32 rt = MoveSToT(cf);
    769   if (const u32 imm = inst->i.imm_sext32(); imm != 0)
    770   {
    771     cg->add(rt, imm);
    772     if (g_settings.cpu_recompiler_memory_exceptions)
    773     {
    774       DebugAssert(cf.valid_host_t);
    775       TestOverflow(rt);
    776     }
    777   }
    778 }
    779 
    780 void CPU::NewRec::X64Compiler::Compile_addiu(CompileFlags cf)
    781 {
    782   const Reg32 rt = MoveSToT(cf);
    783   if (const u32 imm = inst->i.imm_sext32(); imm != 0)
    784     cg->add(rt, imm);
    785 }
    786 
    787 void CPU::NewRec::X64Compiler::Compile_slti(CompileFlags cf)
    788 {
    789   Compile_slti(cf, true);
    790 }
    791 
    792 void CPU::NewRec::X64Compiler::Compile_sltiu(CompileFlags cf)
    793 {
    794   Compile_slti(cf, false);
    795 }
    796 
    797 void CPU::NewRec::X64Compiler::Compile_slti(CompileFlags cf, bool sign)
    798 {
    799   const Reg32 rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG1;
    800 
    801   // Case where T == S, can't use xor because it changes flags
    802   if (!cf.valid_host_t || !cf.valid_host_s || cf.host_t != cf.host_s)
    803     cg->xor_(rt, rt);
    804 
    805   if (cf.valid_host_s)
    806     cg->cmp(CFGetRegS(cf), inst->i.imm_sext32());
    807   else
    808     cg->cmp(MipsPtr(cf.MipsS()), inst->i.imm_sext32());
    809 
    810   if (cf.valid_host_t && cf.valid_host_s && cf.host_t == cf.host_s)
    811     cg->mov(rt, 0);
    812 
    813   sign ? cg->setl(rt.cvt8()) : cg->setb(rt.cvt8());
    814 
    815   if (!cf.valid_host_t)
    816     cg->mov(MipsPtr(cf.MipsT()), rt);
    817 }
    818 
    819 void CPU::NewRec::X64Compiler::Compile_andi(CompileFlags cf)
    820 {
    821   if (const u32 imm = inst->i.imm_zext32(); imm != 0)
    822   {
    823     const Reg32 rt = MoveSToT(cf);
    824     cg->and_(rt, imm);
    825   }
    826   else
    827   {
    828     const Reg32 rt = CFGetRegT(cf);
    829     cg->xor_(rt, rt);
    830   }
    831 }
    832 
    833 void CPU::NewRec::X64Compiler::Compile_ori(CompileFlags cf)
    834 {
    835   const Reg32 rt = MoveSToT(cf);
    836   if (const u32 imm = inst->i.imm_zext32(); imm != 0)
    837     cg->or_(rt, imm);
    838 }
    839 
    840 void CPU::NewRec::X64Compiler::Compile_xori(CompileFlags cf)
    841 {
    842   const Reg32 rt = MoveSToT(cf);
    843   if (const u32 imm = inst->i.imm_zext32(); imm != 0)
    844     cg->xor_(rt, imm);
    845 }
    846 
    847 void CPU::NewRec::X64Compiler::Compile_sll(CompileFlags cf)
    848 {
    849   const Reg32 rd = MoveTToD(cf);
    850   if (inst->r.shamt > 0)
    851     cg->shl(rd, inst->r.shamt);
    852 }
    853 
    854 void CPU::NewRec::X64Compiler::Compile_srl(CompileFlags cf)
    855 {
    856   const Reg32 rd = MoveTToD(cf);
    857   if (inst->r.shamt > 0)
    858     cg->shr(rd, inst->r.shamt);
    859 }
    860 
    861 void CPU::NewRec::X64Compiler::Compile_sra(CompileFlags cf)
    862 {
    863   const Reg32 rd = MoveTToD(cf);
    864   if (inst->r.shamt > 0)
    865     cg->sar(rd, inst->r.shamt);
    866 }
    867 
    868 void CPU::NewRec::X64Compiler::Compile_variable_shift(
    869   CompileFlags cf, void (Xbyak::CodeGenerator::*op)(const Xbyak::Operand&, const Xbyak::Reg8&),
    870   void (Xbyak::CodeGenerator::*op_const)(const Xbyak::Operand&, int))
    871 {
    872   const Reg32 rd = CFGetRegD(cf);
    873   if (!cf.const_s)
    874   {
    875     MoveSToReg(cg->ecx, cf);
    876     MoveTToReg(rd, cf);
    877     (cg->*op)(rd, cg->cl);
    878   }
    879   else
    880   {
    881     MoveTToReg(rd, cf);
    882     (cg->*op_const)(rd, GetConstantRegU32(cf.MipsS()));
    883   }
    884 }
    885 
    886 void CPU::NewRec::X64Compiler::Compile_sllv(CompileFlags cf)
    887 {
    888   Compile_variable_shift(cf, &CodeGenerator::shl, &CodeGenerator::shl);
    889 }
    890 
    891 void CPU::NewRec::X64Compiler::Compile_srlv(CompileFlags cf)
    892 {
    893   Compile_variable_shift(cf, &CodeGenerator::shr, &CodeGenerator::shr);
    894 }
    895 
    896 void CPU::NewRec::X64Compiler::Compile_srav(CompileFlags cf)
    897 {
    898   Compile_variable_shift(cf, &CodeGenerator::sar, &CodeGenerator::sar);
    899 }
    900 
    901 void CPU::NewRec::X64Compiler::Compile_mult(CompileFlags cf, bool sign)
    902 {
    903   // RAX/RDX shouldn't be allocatable..
    904   DebugAssert(!(m_host_regs[Xbyak::Operand::RAX].flags & HR_USABLE) &&
    905               !(m_host_regs[Xbyak::Operand::RDX].flags & HR_USABLE));
    906 
    907   MoveSToReg(cg->eax, cf);
    908   if (cf.valid_host_t)
    909   {
    910     sign ? cg->imul(CFGetRegT(cf)) : cg->mul(CFGetRegT(cf));
    911   }
    912   else if (cf.const_t)
    913   {
    914     cg->mov(cg->edx, GetConstantRegU32(cf.MipsT()));
    915     sign ? cg->imul(cg->edx) : cg->mul(cg->edx);
    916   }
    917   else
    918   {
    919     sign ? cg->imul(MipsPtr(cf.MipsT())) : cg->mul(MipsPtr(cf.MipsT()));
    920   }
    921 
    922   // TODO: skip writeback if it's not needed
    923   if (cf.valid_host_lo)
    924     cg->mov(CFGetRegLO(cf), cg->eax);
    925   else
    926     cg->mov(MipsPtr(Reg::lo), cg->eax);
     927   if (cf.valid_host_hi)
    928     cg->mov(CFGetRegHI(cf), cg->edx);
    929   else
    930     cg->mov(MipsPtr(Reg::hi), cg->edx);
    931 }
    932 
    933 void CPU::NewRec::X64Compiler::Compile_mult(CompileFlags cf)
    934 {
    935   Compile_mult(cf, true);
    936 }
    937 
    938 void CPU::NewRec::X64Compiler::Compile_multu(CompileFlags cf)
    939 {
    940   Compile_mult(cf, false);
    941 }
    942 
    943 void CPU::NewRec::X64Compiler::Compile_div(CompileFlags cf)
    944 {
    945   // not supported without registers for now..
    946   DebugAssert(cf.valid_host_lo && cf.valid_host_hi);
    947 
    948   const Reg32 rt = cf.valid_host_t ? CFGetRegT(cf) : cg->ecx;
    949   if (!cf.valid_host_t)
    950     MoveTToReg(rt, cf);
    951 
    952   const Reg32 rlo = CFGetRegLO(cf);
    953   const Reg32 rhi = CFGetRegHI(cf);
    954 
    955   MoveSToReg(cg->eax, cf);
    956   cg->cdq();
    957 
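           // MIPS DIV never traps: divide-by-zero and 0x80000000 / -1 have defined results, emulated below to avoid a host #DE.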
    958   Label done;
    959   Label not_divide_by_zero;
    960   cg->test(rt, rt);
    961   cg->jnz(not_divide_by_zero, CodeGenerator::T_SHORT);
    962   cg->test(cg->eax, cg->eax);
    963   cg->mov(rhi, cg->eax); // hi = num
    964   cg->mov(rlo, 1);
    965   cg->mov(cg->eax, static_cast<u32>(-1));
    966   cg->cmovns(rlo, cg->eax); // lo = s >= 0 ? -1 : 1
    967   cg->jmp(done, CodeGenerator::T_SHORT);
    968 
    969   cg->L(not_divide_by_zero);
    970   Label not_unrepresentable;
    971   cg->cmp(cg->eax, 0x80000000u);
    972   cg->jne(not_unrepresentable, CodeGenerator::T_SHORT);
    973   cg->cmp(rt, static_cast<u32>(-1));
    974   cg->jne(not_unrepresentable, CodeGenerator::T_SHORT);
    975 
    976   cg->mov(rlo, 0x80000000u);
    977   cg->xor_(rhi, rhi);
    978   cg->jmp(done, CodeGenerator::T_SHORT);
    979 
    980   cg->L(not_unrepresentable);
    981 
    982   cg->idiv(rt);
    983   cg->mov(rlo, cg->eax);
    984   cg->mov(rhi, cg->edx);
    985 
    986   cg->L(done);
    987 }
    988 
    989 void CPU::NewRec::X64Compiler::Compile_divu(CompileFlags cf)
    990 {
    991   // not supported without registers for now..
    992   DebugAssert(cf.valid_host_lo && cf.valid_host_hi);
    993 
    994   const Reg32 rt = cf.valid_host_t ? CFGetRegT(cf) : cg->ecx;
    995   if (!cf.valid_host_t)
    996     MoveTToReg(rt, cf);
    997 
    998   const Reg32 rlo = CFGetRegLO(cf);
    999   const Reg32 rhi = CFGetRegHI(cf);
   1000 
   1001   MoveSToReg(cg->eax, cf);
   1002   cg->xor_(cg->edx, cg->edx);
   1003 
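           // Unsigned divide-by-zero is defined on MIPS: lo = 0xFFFFFFFF, hi = dividend; guard it before using the host div.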
   1004   Label done;
   1005   Label not_divide_by_zero;
   1006   cg->test(rt, rt);
   1007   cg->jnz(not_divide_by_zero, CodeGenerator::T_SHORT);
   1008   cg->mov(rlo, static_cast<u32>(-1));
   1009   cg->mov(rhi, cg->eax);
   1010   cg->jmp(done, CodeGenerator::T_SHORT);
   1011 
   1012   cg->L(not_divide_by_zero);
   1013   cg->div(rt);
   1014   cg->mov(rlo, cg->eax);
   1015   cg->mov(rhi, cg->edx);
   1016 
   1017   cg->L(done);
   1018 }
   1019 
   1020 void CPU::NewRec::X64Compiler::TestOverflow(const Xbyak::Reg32& result)
   1021 {
   1022   SwitchToFarCode(true, &Xbyak::CodeGenerator::jo);
   1023 
   1024   BackupHostState();
   1025 
   1026   // toss the result
   1027   ClearHostReg(result.getIdx());
   1028 
   1029   EndBlockWithException(Exception::Ov);
   1030 
   1031   RestoreHostState();
   1032 
   1033   SwitchToNearCode(false);
   1034 }
   1035 
   1036 void CPU::NewRec::X64Compiler::Compile_dst_op(
   1037   CompileFlags cf, void (Xbyak::CodeGenerator::*op)(const Xbyak::Operand&, const Xbyak::Operand&),
   1038   void (Xbyak::CodeGenerator::*op_const)(const Xbyak::Operand&, u32), bool commutative, bool overflow)
   1039 {
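           // Pick the cheapest form of d = s op t: reuse d when it aliases s, go through a scratch copy when the op is
           // non-commutative and d aliases t, and fold known constants into the immediate variant.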
   1040   if (cf.valid_host_s && cf.valid_host_t)
   1041   {
   1042     if (cf.host_d == cf.host_s)
   1043     {
   1044       (cg->*op)(CFGetRegD(cf), CFGetRegT(cf));
   1045     }
   1046     else if (cf.host_d == cf.host_t)
   1047     {
   1048       if (commutative)
   1049       {
   1050         (cg->*op)(CFGetRegD(cf), CFGetRegS(cf));
   1051       }
   1052       else
   1053       {
   1054         cg->mov(RWARG1, CFGetRegT(cf));
   1055         cg->mov(CFGetRegD(cf), CFGetRegS(cf));
   1056         (cg->*op)(CFGetRegD(cf), RWARG1);
   1057       }
   1058     }
   1059     else
   1060     {
   1061       cg->mov(CFGetRegD(cf), CFGetRegS(cf));
   1062       (cg->*op)(CFGetRegD(cf), CFGetRegT(cf));
   1063     }
   1064   }
   1065   else if (commutative && (cf.const_s || cf.const_t))
   1066   {
   1067     const Reg32 rd = CFGetRegD(cf);
   1068     (cf.const_s) ? MoveTToReg(rd, cf) : MoveSToReg(rd, cf);
   1069     if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0)
   1070       (cg->*op_const)(CFGetRegD(cf), cv);
   1071     else
   1072       overflow = false;
   1073   }
   1074   else if (cf.const_s)
   1075   {
   1076     // need to backup T?
   1077     if (cf.valid_host_d && cf.valid_host_t && cf.host_d == cf.host_t)
   1078     {
   1079       cg->mov(RWARG1, CFGetRegT(cf));
   1080       MoveSToReg(CFGetRegD(cf), cf);
   1081       (cg->*op)(CFGetRegD(cf), RWARG1);
   1082     }
   1083     else
   1084     {
   1085       MoveSToReg(CFGetRegD(cf), cf);
   1086       (cg->*op)(CFGetRegD(cf), CFGetRegT(cf));
   1087     }
   1088   }
   1089   else if (cf.const_t)
   1090   {
   1091     MoveSToReg(CFGetRegD(cf), cf);
   1092     if (const u32 cv = GetConstantRegU32(cf.MipsT()); cv != 0)
   1093       (cg->*op_const)(CFGetRegD(cf), cv);
   1094     else
   1095       overflow = false;
   1096   }
   1097   else if (cf.valid_host_s)
   1098   {
   1099     if (cf.host_d != cf.host_s)
   1100       cg->mov(CFGetRegD(cf), CFGetRegS(cf));
   1101     (cg->*op)(CFGetRegD(cf), MipsPtr(cf.MipsT()));
   1102   }
   1103   else if (cf.valid_host_t)
   1104   {
   1105     if (cf.host_d != cf.host_t)
   1106       cg->mov(CFGetRegD(cf), CFGetRegT(cf));
   1107     (cg->*op)(CFGetRegD(cf), MipsPtr(cf.MipsS()));
   1108   }
   1109   else
   1110   {
   1111     cg->mov(CFGetRegD(cf), MipsPtr(cf.MipsS()));
   1112     (cg->*op)(CFGetRegD(cf), MipsPtr(cf.MipsT()));
   1113   }
   1114 
   1115   if (overflow)
   1116   {
   1117     DebugAssert(cf.valid_host_d);
   1118     TestOverflow(CFGetRegD(cf));
   1119   }
   1120 }
   1121 
   1122 void CPU::NewRec::X64Compiler::Compile_add(CompileFlags cf)
   1123 {
   1124   Compile_dst_op(cf, &CodeGenerator::add, &CodeGenerator::add, true, g_settings.cpu_recompiler_memory_exceptions);
   1125 }
   1126 
   1127 void CPU::NewRec::X64Compiler::Compile_addu(CompileFlags cf)
   1128 {
   1129   Compile_dst_op(cf, &CodeGenerator::add, &CodeGenerator::add, true, false);
   1130 }
   1131 
   1132 void CPU::NewRec::X64Compiler::Compile_sub(CompileFlags cf)
   1133 {
   1134   Compile_dst_op(cf, &CodeGenerator::sub, &CodeGenerator::sub, false, g_settings.cpu_recompiler_memory_exceptions);
   1135 }
   1136 
   1137 void CPU::NewRec::X64Compiler::Compile_subu(CompileFlags cf)
   1138 {
   1139   Compile_dst_op(cf, &CodeGenerator::sub, &CodeGenerator::sub, false, false);
   1140 }
   1141 
   1142 void CPU::NewRec::X64Compiler::Compile_and(CompileFlags cf)
   1143 {
   1144   // special cases - and with self -> self, and with 0 -> 0
   1145   const Reg32 regd = CFGetRegD(cf);
   1146   if (cf.MipsS() == cf.MipsT())
   1147   {
   1148     MoveSToReg(regd, cf);
   1149     return;
   1150   }
   1151   else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))
   1152   {
   1153     cg->xor_(regd, regd);
   1154     return;
   1155   }
   1156 
   1157   Compile_dst_op(cf, &CodeGenerator::and_, &CodeGenerator::and_, true, false);
   1158 }
   1159 
   1160 void CPU::NewRec::X64Compiler::Compile_or(CompileFlags cf)
   1161 {
   1162   // or/nor with 0 -> no effect
   1163   const Reg32 regd = CFGetRegD(cf);
   1164   if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0) || cf.MipsS() == cf.MipsT())
   1165   {
   1166     cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);
   1167     return;
   1168   }
   1169 
   1170   Compile_dst_op(cf, &CodeGenerator::or_, &CodeGenerator::or_, true, false);
   1171 }
   1172 
   1173 void CPU::NewRec::X64Compiler::Compile_xor(CompileFlags cf)
   1174 {
   1175   const Reg32 regd = CFGetRegD(cf);
   1176   if (cf.MipsS() == cf.MipsT())
   1177   {
   1178     // xor with self -> zero
   1179     cg->xor_(regd, regd);
   1180     return;
   1181   }
   1182   else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))
   1183   {
   1184     // xor with zero -> no effect
   1185     cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);
   1186     return;
   1187   }
   1188 
   1189   Compile_dst_op(cf, &CodeGenerator::xor_, &CodeGenerator::xor_, true, false);
   1190 }
   1191 
   1192 void CPU::NewRec::X64Compiler::Compile_nor(CompileFlags cf)
   1193 {
   1194   Compile_or(cf);
   1195   cg->not_(CFGetRegD(cf));
   1196 }
   1197 
   1198 void CPU::NewRec::X64Compiler::Compile_slt(CompileFlags cf)
   1199 {
   1200   Compile_slt(cf, true);
   1201 }
   1202 
   1203 void CPU::NewRec::X64Compiler::Compile_sltu(CompileFlags cf)
   1204 {
   1205   Compile_slt(cf, false);
   1206 }
   1207 
   1208 void CPU::NewRec::X64Compiler::Compile_slt(CompileFlags cf, bool sign)
   1209 {
   1210   const Reg32 rd = CFGetRegD(cf);
   1211   const Reg32 rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;
   1212   const Reg32 rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG1;
   1213   if (!cf.valid_host_s)
   1214     MoveSToReg(rs, cf);
   1215 
   1216   // Case where D == S, can't use xor because it changes flags
   1217   // TODO: swap and reverse op for constants
   1218   if (rd != rs && rd != rt)
   1219     cg->xor_(rd, rd);
   1220 
   1221   if (cf.valid_host_t)
   1222     cg->cmp(rs, CFGetRegT(cf));
   1223   else if (cf.const_t)
   1224     cg->cmp(rs, GetConstantRegU32(cf.MipsT()));
   1225   else
   1226     cg->cmp(rs, MipsPtr(cf.MipsT()));
   1227 
   1228   if (rd == rs || rd == rt)
   1229     cg->mov(rd, 0);
   1230 
   1231   sign ? cg->setl(rd.cvt8()) : cg->setb(rd.cvt8());
   1232 }
   1233 
   1234 Xbyak::Reg32
   1235 CPU::NewRec::X64Compiler::ComputeLoadStoreAddressArg(CompileFlags cf,
   1236                                                      const std::optional<VirtualMemoryAddress>& address,
   1237                                                      const std::optional<const Xbyak::Reg32>& reg /* = std::nullopt */)
   1238 {
   1239   const u32 imm = inst->i.imm_sext32();
   1240   if (cf.valid_host_s && imm == 0 && !reg.has_value())
   1241     return CFGetRegS(cf);
   1242 
   1243   const Reg32 dst = reg.has_value() ? reg.value() : RWARG1;
   1244   if (address.has_value())
   1245   {
   1246     cg->mov(dst, address.value());
   1247   }
   1248   else
   1249   {
   1250     if (cf.valid_host_s)
   1251     {
   1252       if (const Reg32 src = CFGetRegS(cf); src != dst)
   1253         cg->mov(dst, CFGetRegS(cf));
   1254     }
   1255     else
   1256     {
   1257       cg->mov(dst, MipsPtr(cf.MipsS()));
   1258     }
   1259 
   1260     if (imm != 0)
   1261       cg->add(dst, inst->i.imm_sext32());
   1262   }
   1263 
   1264   return dst;
   1265 }
   1266 
   1267 template<typename RegAllocFn>
   1268 Xbyak::Reg32 CPU::NewRec::X64Compiler::GenerateLoad(const Xbyak::Reg32& addr_reg, MemoryAccessSize size, bool sign,
   1269                                                     bool use_fastmem, const RegAllocFn& dst_reg_alloc)
   1270 {
   1271   if (use_fastmem)
   1272   {
   1273     m_cycles += Bus::RAM_READ_TICKS;
   1274 
   1275     const Reg32 dst = dst_reg_alloc();
   1276 
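             // LUT fastmem: fetch the host base for the guest page from the table at RMEMBASE; otherwise RMEMBASE itself
             // points at the base of the mapped guest address space.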
   1277     if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT)
   1278     {
   1279       DebugAssert(addr_reg != RWARG3);
   1280       cg->mov(RWARG3, addr_reg.cvt32());
   1281       cg->shr(RXARG3, Bus::FASTMEM_LUT_PAGE_SHIFT);
   1282       cg->mov(RXARG3, cg->qword[RMEMBASE + RXARG3 * 8]);
   1283     }
   1284 
   1285     const Reg64 membase = (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) ? RXARG3 : RMEMBASE;
   1286     u8* start = cg->getCurr<u8*>();
   1287     switch (size)
   1288     {
   1289       case MemoryAccessSize::Byte:
   1290       {
   1291         sign ? cg->movsx(dst, cg->byte[membase + addr_reg.cvt64()]) :
   1292                cg->movzx(dst, cg->byte[membase + addr_reg.cvt64()]);
   1293       }
   1294       break;
   1295 
   1296       case MemoryAccessSize::HalfWord:
   1297       {
   1298         sign ? cg->movsx(dst, cg->word[membase + addr_reg.cvt64()]) :
   1299                cg->movzx(dst, cg->word[membase + addr_reg.cvt64()]);
   1300       }
   1301       break;
   1302 
   1303       case MemoryAccessSize::Word:
   1304       {
    1305         cg->mov(dst, cg->dword[membase + addr_reg.cvt64()]);
   1306       }
   1307       break;
   1308     }
   1309 
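             // Pad the access up to BACKPATCH_JMP_SIZE so the fault handler can overwrite it with a jmp to the generated slow path.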
   1310     u8* end = cg->getCurr<u8*>();
   1311     while ((end - start) < BACKPATCH_JMP_SIZE)
   1312     {
   1313       cg->nop();
   1314       end = cg->getCurr<u8*>();
   1315     }
   1316 
   1317     AddLoadStoreInfo(start, static_cast<u32>(end - start), static_cast<u32>(addr_reg.getIdx()),
   1318                      static_cast<u32>(dst.getIdx()), size, sign, true);
   1319     return dst;
   1320   }
   1321 
   1322   if (addr_reg != RWARG1)
   1323     cg->mov(RWARG1, addr_reg);
   1324 
   1325   const bool checked = g_settings.cpu_recompiler_memory_exceptions;
   1326   switch (size)
   1327   {
   1328     case MemoryAccessSize::Byte:
   1329     {
   1330       cg->call(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::ReadMemoryByte) :
   1331                          reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryByte));
   1332     }
   1333     break;
   1334     case MemoryAccessSize::HalfWord:
   1335     {
   1336       cg->call(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::ReadMemoryHalfWord) :
   1337                          reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryHalfWord));
   1338     }
   1339     break;
   1340     case MemoryAccessSize::Word:
   1341     {
   1342       cg->call(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::ReadMemoryWord) :
   1343                          reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryWord));
   1344     }
   1345     break;
   1346   }
   1347 
   1348   // TODO: turn this into an asm function instead
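           // Checked read thunks return the loaded value, or a negative result on failure; negating it recovers the exception code for CAUSE below.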
   1349   if (checked)
   1350   {
   1351     cg->test(RXRET, RXRET);
   1352 
   1353     BackupHostState();
   1354     SwitchToFarCode(true, &CodeGenerator::js);
   1355 
   1356     // flush regs, but not pc, it's going to get overwritten
   1357     // flush cycles because of the GTE instruction stuff...
   1358     Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);
   1359 
   1360     // cause_bits = (-result << 2) | BD | cop_n
   1361     cg->mov(RWARG1, RWRET);
   1362     cg->neg(RWARG1);
   1363     cg->shl(RWARG1, 2);
   1364     cg->or_(RWARG1, Cop0Registers::CAUSE::MakeValueForException(
   1365                       static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n));
   1366     cg->mov(RWARG2, m_current_instruction_pc);
   1367     cg->call(static_cast<void (*)(u32, u32)>(&CPU::RaiseException));
   1368     m_dirty_pc = false;
   1369     EndAndLinkBlock(std::nullopt, true, false);
   1370 
   1371     SwitchToNearCode(false);
   1372     RestoreHostState();
   1373   }
   1374 
   1375   const Xbyak::Reg32 dst_reg = dst_reg_alloc();
   1376   switch (size)
   1377   {
   1378     case MemoryAccessSize::Byte:
   1379     {
   1380       sign ? cg->movsx(dst_reg, RWRET.cvt8()) : cg->movzx(dst_reg, RWRET.cvt8());
   1381     }
   1382     break;
   1383     case MemoryAccessSize::HalfWord:
   1384     {
   1385       sign ? cg->movsx(dst_reg, RWRET.cvt16()) : cg->movzx(dst_reg, RWRET.cvt16());
   1386     }
   1387     break;
   1388     case MemoryAccessSize::Word:
   1389     {
   1390       if (dst_reg != RWRET)
   1391         cg->mov(dst_reg, RWRET);
   1392     }
   1393     break;
   1394   }
   1395 
   1396   return dst_reg;
   1397 }
   1398 
   1399 void CPU::NewRec::X64Compiler::GenerateStore(const Xbyak::Reg32& addr_reg, const Xbyak::Reg32& value_reg,
   1400                                              MemoryAccessSize size, bool use_fastmem)
   1401 {
   1402   if (use_fastmem)
   1403   {
   1404     if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT)
   1405     {
   1406       DebugAssert(addr_reg != RWARG3 && value_reg != RWARG3);
   1407       cg->mov(RWARG3, addr_reg.cvt32());
   1408       cg->shr(RXARG3, Bus::FASTMEM_LUT_PAGE_SHIFT);
   1409       cg->mov(RXARG3, cg->qword[RMEMBASE + RXARG3 * 8]);
   1410     }
   1411 
   1412     const Reg64 membase = (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) ? RXARG3 : RMEMBASE;
   1413     u8* start = cg->getCurr<u8*>();
   1414     switch (size)
   1415     {
   1416       case MemoryAccessSize::Byte:
   1417         cg->mov(cg->byte[membase + addr_reg.cvt64()], value_reg.cvt8());
   1418         break;
   1419 
   1420       case MemoryAccessSize::HalfWord:
   1421         cg->mov(cg->word[membase + addr_reg.cvt64()], value_reg.cvt16());
   1422         break;
   1423 
   1424       case MemoryAccessSize::Word:
    1425         cg->mov(cg->dword[membase + addr_reg.cvt64()], value_reg.cvt32());
   1426         break;
   1427     }
   1428 
   1429     u8* end = cg->getCurr<u8*>();
   1430     while ((end - start) < BACKPATCH_JMP_SIZE)
   1431     {
   1432       cg->nop();
   1433       end = cg->getCurr<u8*>();
   1434     }
   1435 
   1436     AddLoadStoreInfo(start, static_cast<u32>(end - start), static_cast<u32>(addr_reg.getIdx()),
   1437                      static_cast<u32>(value_reg.getIdx()), size, false, false);
   1438     return;
   1439   }
   1440 
   1441   if (addr_reg != RWARG1)
   1442     cg->mov(RWARG1, addr_reg);
   1443   if (value_reg != RWARG2)
   1444     cg->mov(RWARG2, value_reg);
   1445 
   1446   const bool checked = g_settings.cpu_recompiler_memory_exceptions;
   1447   switch (size)
   1448   {
   1449     case MemoryAccessSize::Byte:
   1450     {
   1451       cg->call(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::WriteMemoryByte) :
   1452                          reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryByte));
   1453     }
   1454     break;
   1455     case MemoryAccessSize::HalfWord:
   1456     {
   1457       cg->call(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::WriteMemoryHalfWord) :
   1458                          reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryHalfWord));
   1459     }
   1460     break;
   1461     case MemoryAccessSize::Word:
   1462     {
   1463       cg->call(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::WriteMemoryWord) :
   1464                          reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryWord));
   1465     }
   1466     break;
   1467   }
   1468 
   1469   // TODO: turn this into an asm function instead
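           // Checked write thunks return zero on success; on failure the non-zero result shifted left by two forms the CAUSE bits below.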
   1470   if (checked)
   1471   {
   1472     cg->test(RWRET, RWRET);
   1473 
   1474     BackupHostState();
   1475     SwitchToFarCode(true, &CodeGenerator::jnz);
   1476 
   1477     // flush regs, but not pc, it's going to get overwritten
   1478     // flush cycles because of the GTE instruction stuff...
   1479     Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);
   1480 
   1481     // cause_bits = (result << 2) | BD | cop_n
   1482     cg->mov(RWARG1, RWRET);
   1483     cg->shl(RWARG1, 2);
   1484     cg->or_(RWARG1, Cop0Registers::CAUSE::MakeValueForException(
   1485                       static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n));
   1486     cg->mov(RWARG2, m_current_instruction_pc);
   1487     cg->call(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
   1488     m_dirty_pc = false;
   1489     EndAndLinkBlock(std::nullopt, true, false);
   1490 
   1491     SwitchToNearCode(false);
   1492     RestoreHostState();
   1493   }
   1494 }
   1495 
   1496 void CPU::NewRec::X64Compiler::Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
   1497                                            const std::optional<VirtualMemoryAddress>& address)
   1498 {
   1499   const std::optional<Reg32> addr_reg = g_settings.gpu_pgxp_enable ?
   1500                                           std::optional<Reg32>(Reg32(AllocateTempHostReg(HR_CALLEE_SAVED))) :
   1501                                           std::optional<Reg32>();
   1502   FlushForLoadStore(address, false, use_fastmem);
   1503   const Reg32 addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
   1504 
   1505   const Reg32 data = GenerateLoad(addr, size, sign, use_fastmem, [this, cf]() {
   1506     if (cf.MipsT() == Reg::zero)
   1507       return RWRET;
   1508 
   1509     return Reg32(AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
   1510                                  EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, cf.MipsT()));
   1511   });
   1512 
   1513   if (g_settings.gpu_pgxp_enable)
   1514   {
   1515     Flush(FLUSH_FOR_C_CALL);
   1516 
   1517     cg->mov(RWARG1, inst->bits);
   1518     cg->mov(RWARG2, addr);
   1519     cg->mov(RWARG3, data);
   1520     cg->call(s_pgxp_mem_load_functions[static_cast<u32>(size)][static_cast<u32>(sign)]);
   1521     FreeHostReg(addr_reg.value().getIdx());
   1522   }
   1523 }
   1524 
   1525 void CPU::NewRec::X64Compiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
   1526                                            const std::optional<VirtualMemoryAddress>& address)
   1527 {
   1528   DebugAssert(size == MemoryAccessSize::Word && !sign);
   1529 
   1530   const Reg32 addr = Reg32(AllocateTempHostReg(HR_CALLEE_SAVED));
   1531   FlushForLoadStore(address, false, use_fastmem);
   1532 
   1533   // TODO: if address is constant, this can be simplified..
   1534 
   1535   // If we're coming from another block, just flush the load delay and hope for the best..
   1536   if (m_load_delay_dirty)
   1537     UpdateLoadDelay();
   1538 
   1539   // We'd need to be careful here if we weren't overwriting it..
   1540   ComputeLoadStoreAddressArg(cf, address, addr);
   1541   cg->mov(RWARG1, addr);
   1542   cg->and_(RWARG1, ~0x3u);
   1543   GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });
   1544 
   1545   if (inst->r.rt == Reg::zero)
   1546   {
   1547     FreeHostReg(addr.getIdx());
   1548     return;
   1549   }
   1550 
    1551   // lwl/lwr reading a load-delayed register takes the new value, but is itself load-delayed, so the original value is
   1552   // never written back. NOTE: can't trust T in cf because of the flush
   1553   const Reg rt = inst->r.rt;
   1554   Reg32 value;
   1555   if (m_load_delay_register == rt)
   1556   {
   1557     const u32 existing_ld_rt = (m_load_delay_value_register == NUM_HOST_REGS) ?
   1558                                  AllocateHostReg(HR_MODE_READ, HR_TYPE_LOAD_DELAY_VALUE, rt) :
   1559                                  m_load_delay_value_register;
   1560     RenameHostReg(existing_ld_rt, HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt);
   1561     value = Reg32(existing_ld_rt);
   1562   }
   1563   else
   1564   {
   1565     if constexpr (EMULATE_LOAD_DELAYS)
   1566     {
   1567       value = Reg32(AllocateHostReg(HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt));
   1568       if (HasConstantReg(rt))
   1569         cg->mov(value, GetConstantRegU32(rt));
   1570       else if (const std::optional<u32> rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value())
   1571         cg->mov(value, Reg32(rtreg.value()));
   1572       else
   1573         cg->mov(value, MipsPtr(rt));
   1574     }
   1575     else
   1576     {
   1577       value = Reg32(AllocateHostReg(HR_MODE_READ | HR_MODE_WRITE, HR_TYPE_CPU_REG, rt));
   1578     }
   1579   }
   1580 
   1581   DebugAssert(value != cg->ecx);
   1582   cg->mov(cg->ecx, addr);
   1583   cg->and_(cg->ecx, 3);
   1584   cg->shl(cg->ecx, 3); // *8
   1585 
   1586   // TODO for other arch: reverse subtract
   1587   DebugAssert(RWARG2 != cg->ecx);
   1588   cg->mov(RWARG2, 24);
   1589   cg->sub(RWARG2, cg->ecx);
   1590 
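           // Worked example (lwl, addr & 3 == 1): shift = 8, so the register keeps its low 16 bits
           // (0x00FFFFFF >> 8 == 0x0000FFFF) and the loaded word is shifted left by 24 - 8 = 16 bits
           // before being merged in.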
   1591   if (inst->op == InstructionOp::lwl)
   1592   {
   1593     // const u32 mask = UINT32_C(0x00FFFFFF) >> shift;
   1594     // new_value = (value & mask) | (RWRET << (24 - shift));
   1595     cg->mov(RWARG3, 0xFFFFFFu);
   1596     cg->shr(RWARG3, cg->cl);
   1597     cg->and_(value, RWARG3);
   1598     cg->mov(cg->ecx, RWARG2);
   1599     cg->shl(RWRET, cg->cl);
   1600     cg->or_(value, RWRET);
   1601   }
   1602   else
   1603   {
   1604     // const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift);
   1605     // new_value = (value & mask) | (RWRET >> shift);
   1606     cg->shr(RWRET, cg->cl);
   1607     cg->mov(RWARG3, 0xFFFFFF00u);
   1608     cg->mov(cg->ecx, RWARG2);
   1609     cg->shl(RWARG3, cg->cl);
   1610     cg->and_(value, RWARG3);
   1611     cg->or_(value, RWRET);
   1612   }
   1613 
   1614   FreeHostReg(addr.getIdx());
   1615 
   1616   if (g_settings.gpu_pgxp_enable)
   1617   {
   1618     Flush(FLUSH_FOR_C_CALL);
   1619 
   1620     DebugAssert(value != RWARG3);
   1621     cg->mov(RWARG3, value);
   1622     cg->mov(RWARG2, addr);
   1623     cg->and_(RWARG2, ~0x3u);
   1624     cg->mov(RWARG1, inst->bits);
   1625     cg->call(reinterpret_cast<const void*>(&PGXP::CPU_LW));
   1626   }
   1627 }
   1628 
   1629 void CPU::NewRec::X64Compiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
   1630                                             const std::optional<VirtualMemoryAddress>& address)
   1631 {
   1632   const u32 index = static_cast<u32>(inst->r.rt.GetValue());
   1633   const auto [ptr, action] = GetGTERegisterPointer(index, true);
   1634   const std::optional<Reg32> addr_reg = g_settings.gpu_pgxp_enable ?
   1635                                           std::optional<Reg32>(Reg32(AllocateTempHostReg(HR_CALLEE_SAVED))) :
   1636                                           std::optional<Reg32>();
   1637   FlushForLoadStore(address, false, use_fastmem);
   1638   const Reg32 addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
   1639   const Reg32 value = GenerateLoad(addr, MemoryAccessSize::Word, false, use_fastmem, [this, action = action]() {
   1640     return (action == GTERegisterAccessAction::CallHandler && g_settings.gpu_pgxp_enable) ?
   1641              Reg32(AllocateTempHostReg(HR_CALLEE_SAVED)) :
   1642              RWRET;
   1643   });
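           // With PGXP enabled and a call-handler register, the loaded value has to outlive the
           // GTE::WriteRegister call below, so it is kept in a callee-saved temporary instead of RWRET.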
   1644 
   1645   switch (action)
   1646   {
   1647     case GTERegisterAccessAction::Ignore:
   1648     {
   1649       break;
   1650     }
   1651 
   1652     case GTERegisterAccessAction::Direct:
   1653     {
   1654       cg->mov(cg->dword[PTR(ptr)], value);
   1655       break;
   1656     }
   1657 
   1658     case GTERegisterAccessAction::SignExtend16:
   1659     {
   1660       cg->movsx(RWARG3, value.cvt16());
   1661       cg->mov(cg->dword[PTR(ptr)], RWARG3);
   1662       break;
   1663     }
   1664 
   1665     case GTERegisterAccessAction::ZeroExtend16:
   1666     {
   1667       cg->movzx(RWARG3, value.cvt16());
   1668       cg->mov(cg->dword[PTR(ptr)], RWARG3);
   1669       break;
   1670     }
   1671 
   1672     case GTERegisterAccessAction::CallHandler:
   1673     {
   1674       Flush(FLUSH_FOR_C_CALL);
   1675       cg->mov(RWARG2, value);
   1676       cg->mov(RWARG1, index);
   1677       cg->call(&GTE::WriteRegister);
   1678       break;
   1679     }
   1680 
   1681     case GTERegisterAccessAction::PushFIFO:
   1682     {
   1683       // SXY0 <- SXY1
   1684       // SXY1 <- SXY2
   1685       // SXY2 <- SXYP
   1686       DebugAssert(value != RWARG1 && value != RWARG2);
   1687       cg->mov(RWARG1, cg->dword[PTR(&g_state.gte_regs.SXY1[0])]);
   1688       cg->mov(RWARG2, cg->dword[PTR(&g_state.gte_regs.SXY2[0])]);
   1689       cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY0[0])], RWARG1);
   1690       cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY1[0])], RWARG2);
   1691       cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY2[0])], value);
   1692       break;
   1693     }
   1694 
   1695     default:
   1696     {
   1697       Panic("Unknown action");
   1698       return;
   1699     }
   1700   }
   1701 
   1702   if (g_settings.gpu_pgxp_enable)
   1703   {
   1704     Flush(FLUSH_FOR_C_CALL);
   1705     cg->mov(RWARG3, value);
   1706     if (value != RWRET)
   1707       FreeHostReg(value.getIdx());
   1708     cg->mov(RWARG2, addr);
   1709     FreeHostReg(addr_reg.value().getIdx());
   1710     cg->mov(RWARG1, inst->bits);
   1711     cg->call(reinterpret_cast<const void*>(&PGXP::CPU_LWC2));
   1712   }
   1713 }
   1714 
   1715 void CPU::NewRec::X64Compiler::Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
   1716                                            const std::optional<VirtualMemoryAddress>& address)
   1717 {
   1718   const std::optional<Reg32> addr_reg = g_settings.gpu_pgxp_enable ?
   1719                                           std::optional<Reg32>(Reg32(AllocateTempHostReg(HR_CALLEE_SAVED))) :
   1720                                           std::optional<Reg32>();
   1721   FlushForLoadStore(address, true, use_fastmem);
   1722   const Reg32 addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
   1723   const Reg32 data = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
   1724   if (!cf.valid_host_t)
   1725     MoveTToReg(RWARG2, cf);
   1726 
   1727   GenerateStore(addr, data, size, use_fastmem);
   1728 
   1729   if (g_settings.gpu_pgxp_enable)
   1730   {
   1731     Flush(FLUSH_FOR_C_CALL);
   1732     MoveMIPSRegToReg(RWARG3, cf.MipsT());
   1733     cg->mov(RWARG2, addr);
   1734     cg->mov(RWARG1, inst->bits);
   1735     cg->call(s_pgxp_mem_store_functions[static_cast<u32>(size)]);
   1736     FreeHostReg(addr_reg.value().getIdx());
   1737   }
   1738 }
   1739 
   1740 void CPU::NewRec::X64Compiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
   1741                                            const std::optional<VirtualMemoryAddress>& address)
   1742 {
   1743   DebugAssert(size == MemoryAccessSize::Word && !sign);
   1744 
   1745   // TODO: this can take over rt's value if it's no longer needed
   1746   // NOTE: can't trust T in cf because of the alloc
   1747   const Reg32 addr = Reg32(AllocateTempHostReg(HR_CALLEE_SAVED));
   1748   const Reg32 value = g_settings.gpu_pgxp_enable ? Reg32(AllocateTempHostReg(HR_CALLEE_SAVED)) : RWARG2;
   1749   if (g_settings.gpu_pgxp_enable)
   1750     MoveMIPSRegToReg(value, inst->r.rt);
   1751 
   1752   FlushForLoadStore(address, true, use_fastmem);
   1753 
   1754   // TODO: if address is constant, this can be simplified..
   1755   // We'd need to be careful here if we weren't overwriting it..
   1756   ComputeLoadStoreAddressArg(cf, address, addr);
   1757   cg->mov(RWARG1, addr);
   1758   cg->and_(RWARG1, ~0x3u);
   1759   GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });
   1760 
   1761   DebugAssert(value != cg->ecx);
   1762   cg->mov(cg->ecx, addr);
   1763   cg->and_(cg->ecx, 3);
   1764   cg->shl(cg->ecx, 3); // *8
   1765   cg->and_(addr, ~0x3u);
   1766 
    1767   // Need to load it down here when PGXP is off, because it's in a volatile reg that can get overwritten by the flush.
   1768   if (!g_settings.gpu_pgxp_enable)
   1769     MoveMIPSRegToReg(value, inst->r.rt);
   1770 
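           // Worked example (swl, addr & 3 == 1): shift = 8, so the memory word keeps its upper 16 bits
           // (0xFFFFFF00 << 8 == 0xFFFF0000) and the register value is shifted right by 24 - 8 = 16 bits
           // before being merged in.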
   1771   if (inst->op == InstructionOp::swl)
   1772   {
   1773     // const u32 mem_mask = UINT32_C(0xFFFFFF00) << shift;
   1774     // new_value = (RWRET & mem_mask) | (value >> (24 - shift));
   1775     cg->mov(RWARG3, 0xFFFFFF00u);
   1776     cg->shl(RWARG3, cg->cl);
   1777     cg->and_(RWRET, RWARG3);
   1778 
   1779     cg->mov(RWARG3, 24);
   1780     cg->sub(RWARG3, cg->ecx);
   1781     cg->mov(cg->ecx, RWARG3);
   1782     cg->shr(value, cg->cl);
   1783     cg->or_(value, RWRET);
   1784   }
   1785   else
   1786   {
   1787     // const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift);
   1788     // new_value = (RWRET & mem_mask) | (value << shift);
   1789     cg->shl(value, cg->cl);
   1790 
   1791     DebugAssert(RWARG3 != cg->ecx);
   1792     cg->mov(RWARG3, 24);
   1793     cg->sub(RWARG3, cg->ecx);
   1794     cg->mov(cg->ecx, RWARG3);
   1795     cg->mov(RWARG3, 0x00FFFFFFu);
   1796     cg->shr(RWARG3, cg->cl);
   1797     cg->and_(RWRET, RWARG3);
   1798     cg->or_(value, RWRET);
   1799   }
   1800 
   1801   if (!g_settings.gpu_pgxp_enable)
   1802   {
   1803     GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
   1804     FreeHostReg(addr.getIdx());
   1805   }
   1806   else
   1807   {
   1808     GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
   1809 
   1810     Flush(FLUSH_FOR_C_CALL);
   1811     cg->mov(RWARG3, value);
   1812     FreeHostReg(value.getIdx());
   1813     cg->mov(RWARG2, addr);
   1814     FreeHostReg(addr.getIdx());
   1815     cg->mov(RWARG1, inst->bits);
   1816     cg->call(reinterpret_cast<const void*>(&PGXP::CPU_SW));
   1817   }
   1818 }
   1819 
   1820 void CPU::NewRec::X64Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
   1821                                             const std::optional<VirtualMemoryAddress>& address)
   1822 {
   1823   const u32 index = static_cast<u32>(inst->r.rt.GetValue());
   1824   const auto [ptr, action] = GetGTERegisterPointer(index, false);
   1825   switch (action)
   1826   {
   1827     case GTERegisterAccessAction::Direct:
   1828     {
   1829       cg->mov(RWARG2, cg->dword[PTR(ptr)]);
   1830     }
   1831     break;
   1832 
   1833     case GTERegisterAccessAction::CallHandler:
   1834     {
   1835       // should already be flushed.. except in fastmem case
   1836       Flush(FLUSH_FOR_C_CALL);
   1837       cg->mov(RWARG1, index);
   1838       cg->call(&GTE::ReadRegister);
   1839       cg->mov(RWARG2, RWRET);
   1840     }
   1841     break;
   1842 
   1843     default:
   1844     {
   1845       Panic("Unknown action");
   1846     }
   1847     break;
   1848   }
   1849 
   1850   // PGXP makes this a giant pain.
   1851   if (!g_settings.gpu_pgxp_enable)
   1852   {
   1853     FlushForLoadStore(address, true, use_fastmem);
   1854     const Reg32 addr = ComputeLoadStoreAddressArg(cf, address);
   1855     GenerateStore(addr, RWARG2, size, use_fastmem);
   1856     return;
   1857   }
   1858 
   1859   // TODO: This can be simplified because we don't need to validate in PGXP..
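           // RWARG2 (the data) can be clobbered by the store and the flush, so it is kept in a
           // callee-saved register for the PGXP call afterwards.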
   1860   const Reg32 addr_reg = Reg32(AllocateTempHostReg(HR_CALLEE_SAVED));
   1861   const Reg32 data_backup = Reg32(AllocateTempHostReg(HR_CALLEE_SAVED));
   1862   FlushForLoadStore(address, true, use_fastmem);
   1863   ComputeLoadStoreAddressArg(cf, address, addr_reg);
   1864   cg->mov(data_backup, RWARG2);
   1865   GenerateStore(addr_reg, RWARG2, size, use_fastmem);
   1866 
   1867   Flush(FLUSH_FOR_C_CALL);
   1868   cg->mov(RWARG3, data_backup);
   1869   cg->mov(RWARG2, addr_reg);
   1870   cg->mov(RWARG1, inst->bits);
   1871   cg->call(reinterpret_cast<const void*>(&PGXP::CPU_SWC2));
   1872   FreeHostReg(addr_reg.getIdx());
   1873   FreeHostReg(data_backup.getIdx());
   1874 }
   1875 
   1876 void CPU::NewRec::X64Compiler::Compile_mtc0(CompileFlags cf)
   1877 {
   1878   const Cop0Reg reg = static_cast<Cop0Reg>(MipsD());
   1879   const u32* ptr = GetCop0RegPtr(reg);
   1880   const u32 mask = GetCop0RegWriteMask(reg);
   1881   if (!ptr)
   1882   {
   1883     Compile_Fallback();
   1884     return;
   1885   }
   1886 
   1887   // TODO: const apply mask
   1888   const Reg32 rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG1;
   1889   const u32 constant_value = cf.const_t ? GetConstantRegU32(cf.MipsT()) : 0;
   1890   if (mask == 0)
   1891   {
   1892     // if it's a read-only register, ignore
   1893     DEBUG_LOG("Ignoring write to read-only cop0 reg {}", static_cast<u32>(reg));
   1894     return;
   1895   }
   1896 
   1897   // for some registers, we need to test certain bits
   1898   const bool needs_bit_test = (reg == Cop0Reg::SR);
   1899   const Reg32 changed_bits = RWARG3;
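           // For SR the changed bits are tracked so the (far) slow path below only runs when the cache
           // isolation bit actually flips; interrupts are re-tested unconditionally either way.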
   1900 
   1901   // update value
   1902   if (cf.valid_host_t)
   1903   {
   1904     cg->mov(RWARG1, rt);
   1905     cg->mov(RWARG2, cg->dword[PTR(ptr)]);
   1906     cg->and_(RWARG1, mask);
   1907     if (needs_bit_test)
   1908     {
   1909       cg->mov(changed_bits, RWARG2);
   1910       cg->xor_(changed_bits, RWARG1);
   1911     }
   1912     cg->and_(RWARG2, ~mask);
   1913     cg->or_(RWARG2, RWARG1);
   1914     cg->mov(cg->dword[PTR(ptr)], RWARG2);
   1915   }
   1916   else
   1917   {
   1918     cg->mov(RWARG2, cg->dword[PTR(ptr)]);
   1919     if (needs_bit_test)
   1920     {
   1921       cg->mov(changed_bits, RWARG2);
   1922       cg->xor_(changed_bits, constant_value & mask);
   1923     }
   1924     cg->and_(RWARG2, ~mask);
   1925     cg->or_(RWARG2, constant_value & mask);
   1926     cg->mov(cg->dword[PTR(ptr)], RWARG2);
   1927   }
   1928 
   1929   if (reg == Cop0Reg::SR)
   1930   {
   1931     // TODO: replace with register backup
   1932     // We could just inline the whole thing..
   1933     Flush(FLUSH_FOR_C_CALL);
   1934 
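             // Bit 16 of SR is the cache isolation bit; toggling it reroutes memory accesses, so the
             // memory pointers (and the fastmem base) have to be refreshed on the slow path below.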
   1935     cg->test(changed_bits, 1u << 16);
   1936     SwitchToFarCode(true, &CodeGenerator::jnz);
   1937     cg->mov(cg->dword[cg->rsp], RWARG2);
   1938     cg->sub(cg->rsp, STACK_SHADOW_SIZE + 8);
   1939     cg->call(&CPU::UpdateMemoryPointers);
   1940     cg->add(cg->rsp, STACK_SHADOW_SIZE + 8);
   1941     cg->mov(RWARG2, cg->dword[cg->rsp]);
   1942     cg->mov(RMEMBASE, cg->qword[PTR(&g_state.fastmem_base)]);
   1943     SwitchToNearCode(true);
   1944 
   1945     TestInterrupts(RWARG2);
   1946   }
   1947   else if (reg == Cop0Reg::CAUSE)
   1948   {
   1949     cg->mov(RWARG1, cg->dword[PTR(&g_state.cop0_regs.sr.bits)]);
   1950     TestInterrupts(RWARG1);
   1951   }
   1952 
   1953   if (reg == Cop0Reg::DCIC && g_settings.cpu_recompiler_memory_exceptions)
   1954   {
   1955     // TODO: DCIC handling for debug breakpoints
   1956     WARNING_LOG("TODO: DCIC handling for debug breakpoints");
   1957   }
   1958 }
   1959 
   1960 void CPU::NewRec::X64Compiler::Compile_rfe(CompileFlags cf)
   1961 {
   1962   // shift mode bits right two, preserving upper bits
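           // (SR bits 5..0 hold a three-deep stack of KU/IE pairs; RFE pops it: current <- previous,
           //  previous <- old, with the old pair left unchanged.)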
   1963   static constexpr u32 mode_bits_mask = UINT32_C(0b1111);
   1964   cg->mov(RWARG1, cg->dword[PTR(&g_state.cop0_regs.sr.bits)]);
   1965   cg->mov(RWARG2, RWARG1);
   1966   cg->shr(RWARG2, 2);
   1967   cg->and_(RWARG1, ~mode_bits_mask);
   1968   cg->and_(RWARG2, mode_bits_mask);
   1969   cg->or_(RWARG1, RWARG2);
   1970   cg->mov(cg->dword[PTR(&g_state.cop0_regs.sr.bits)], RWARG1);
   1971 
   1972   TestInterrupts(RWARG1);
   1973 }
   1974 
   1975 void CPU::NewRec::X64Compiler::TestInterrupts(const Xbyak::Reg32& sr)
   1976 {
   1977   // if Iec == 0 then goto no_interrupt
   1978   Label no_interrupt;
   1979 
   1980   cg->test(sr, 1);
   1981   cg->jz(no_interrupt, CodeGenerator::T_NEAR);
   1982 
   1983   // sr & cause
   1984   cg->and_(sr, cg->dword[PTR(&g_state.cop0_regs.cause.bits)]);
   1985 
   1986   // ((sr & cause) & 0xff00) == 0 goto no_interrupt
   1987   cg->test(sr, 0xFF00);
   1988 
   1989   SwitchToFarCode(true, &CodeGenerator::jnz);
   1990   BackupHostState();
   1991 
    1992   // Update the load delay; this normally happens at the end of an instruction, but we're finishing it early.
   1993   UpdateLoadDelay();
   1994 
   1995   Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);
   1996 
   1997   // Can't use EndBlockWithException() here, because it'll use the wrong PC.
   1998   // Can't use RaiseException() on the fast path if we're the last instruction, because the next PC is unknown.
   1999   if (!iinfo->is_last_instruction)
   2000   {
   2001     cg->mov(RWARG1, Cop0Registers::CAUSE::MakeValueForException(Exception::INT, iinfo->is_branch_instruction, false,
   2002                                                                 (inst + 1)->cop.cop_n));
   2003     cg->mov(RWARG2, m_compiler_pc);
   2004     cg->call(static_cast<void (*)(u32, u32)>(&CPU::RaiseException));
   2005     m_dirty_pc = false;
   2006     EndAndLinkBlock(std::nullopt, true, false);
   2007   }
   2008   else
   2009   {
   2010     if (m_dirty_pc)
   2011       cg->mov(cg->dword[PTR(&g_state.pc)], m_compiler_pc);
   2012     m_dirty_pc = false;
   2013     cg->mov(cg->dword[PTR(&g_state.downcount)], 0);
   2014     EndAndLinkBlock(std::nullopt, false, true);
   2015   }
   2016 
   2017   RestoreHostState();
   2018   SwitchToNearCode(false);
   2019 
   2020   cg->L(no_interrupt);
   2021 }
   2022 
   2023 void CPU::NewRec::X64Compiler::Compile_mfc2(CompileFlags cf)
   2024 {
   2025   const u32 index = inst->cop.Cop2Index();
   2026   const Reg rt = inst->r.rt;
   2027 
   2028   const auto [ptr, action] = GetGTERegisterPointer(index, false);
   2029   if (action == GTERegisterAccessAction::Ignore)
   2030     return;
   2031 
   2032   u32 hreg;
   2033   if (action == GTERegisterAccessAction::Direct)
   2034   {
   2035     hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
   2036                            EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
   2037     cg->mov(Reg32(hreg), cg->dword[PTR(ptr)]);
   2038   }
   2039   else if (action == GTERegisterAccessAction::CallHandler)
   2040   {
   2041     Flush(FLUSH_FOR_C_CALL);
   2042     cg->mov(RWARG1, index);
   2043     cg->call(&GTE::ReadRegister);
   2044 
   2045     hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
   2046                            EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
   2047     cg->mov(Reg32(hreg), RWRET);
   2048   }
   2049   else
   2050   {
   2051     Panic("Unknown action");
   2052     return;
   2053   }
   2054 
   2055   if (g_settings.gpu_pgxp_enable)
   2056   {
   2057     Flush(FLUSH_FOR_C_CALL);
   2058     cg->mov(RWARG1, inst->bits);
   2059     cg->mov(RWARG2, Reg32(hreg));
   2060     cg->call(reinterpret_cast<const void*>(&PGXP::CPU_MFC2));
   2061   }
   2062 }
   2063 
   2064 void CPU::NewRec::X64Compiler::Compile_mtc2(CompileFlags cf)
   2065 {
   2066   const u32 index = inst->cop.Cop2Index();
   2067   const auto [ptr, action] = GetGTERegisterPointer(index, true);
   2068   if (action == GTERegisterAccessAction::Ignore)
   2069     return;
   2070 
   2071   if (action == GTERegisterAccessAction::Direct)
   2072   {
   2073     if (cf.const_t)
   2074     {
   2075       cg->mov(cg->dword[PTR(ptr)], GetConstantRegU32(cf.MipsT()));
   2076     }
   2077     else if (cf.valid_host_t)
   2078     {
   2079       cg->mov(cg->dword[PTR(ptr)], CFGetRegT(cf));
   2080     }
   2081     else
   2082     {
   2083       cg->mov(RWARG1, MipsPtr(cf.MipsT()));
   2084       cg->mov(cg->dword[PTR(ptr)], RWARG1);
   2085     }
   2086   }
   2087   else if (action == GTERegisterAccessAction::SignExtend16 || action == GTERegisterAccessAction::ZeroExtend16)
   2088   {
   2089     const bool sign = (action == GTERegisterAccessAction::SignExtend16);
   2090     if (cf.const_t)
   2091     {
   2092       const u16 cv = Truncate16(GetConstantRegU32(cf.MipsT()));
   2093       cg->mov(cg->dword[PTR(ptr)], sign ? ::SignExtend32(cv) : ::ZeroExtend32(cv));
   2094     }
   2095     else if (cf.valid_host_t)
   2096     {
   2097       sign ? cg->movsx(RWARG1, Reg16(cf.host_t)) : cg->movzx(RWARG1, Reg16(cf.host_t));
   2098       cg->mov(cg->dword[PTR(ptr)], RWARG1);
   2099     }
   2100     else
   2101     {
   2102       sign ? cg->movsx(RWARG1, cg->word[PTR(&g_state.regs.r[cf.mips_t])]) :
   2103              cg->movzx(RWARG1, cg->word[PTR(&g_state.regs.r[cf.mips_t])]);
   2104       cg->mov(cg->dword[PTR(ptr)], RWARG1);
   2105     }
   2106   }
   2107   else if (action == GTERegisterAccessAction::CallHandler)
   2108   {
   2109     Flush(FLUSH_FOR_C_CALL);
   2110     cg->mov(RWARG1, index);
   2111     MoveTToReg(RWARG2, cf);
   2112     cg->call(&GTE::WriteRegister);
   2113   }
   2114   else if (action == GTERegisterAccessAction::PushFIFO)
   2115   {
   2116     // SXY0 <- SXY1
   2117     // SXY1 <- SXY2
   2118     // SXY2 <- SXYP
   2119     cg->mov(RWARG1, cg->dword[PTR(&g_state.gte_regs.SXY1[0])]);
   2120     cg->mov(RWARG2, cg->dword[PTR(&g_state.gte_regs.SXY2[0])]);
   2121     if (!cf.const_t && !cf.valid_host_t)
   2122       cg->mov(RWARG3, MipsPtr(cf.MipsT()));
   2123     cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY0[0])], RWARG1);
   2124     cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY1[0])], RWARG2);
   2125     if (cf.const_t)
   2126       cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY2[0])], GetConstantRegU32(cf.MipsT()));
   2127     else if (cf.valid_host_t)
   2128       cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY2[0])], CFGetRegT(cf));
   2129     else
   2130       cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY2[0])], RWARG3);
   2131   }
   2132   else
   2133   {
   2134     Panic("Unknown action");
   2135   }
   2136 }
   2137 
   2138 void CPU::NewRec::X64Compiler::Compile_cop2(CompileFlags cf)
   2139 {
   2140   TickCount func_ticks;
   2141   GTE::InstructionImpl func = GTE::GetInstructionImpl(inst->bits, &func_ticks);
   2142 
   2143   Flush(FLUSH_FOR_C_CALL);
   2144   cg->mov(RWARG1, inst->bits & GTE::Instruction::REQUIRED_BITS_MASK);
   2145   cg->call(reinterpret_cast<const void*>(func));
   2146 
   2147   AddGTETicks(func_ticks);
   2148 }
   2149 
   2150 u32 CPU::NewRec::CompileLoadStoreThunk(void* thunk_code, u32 thunk_space, void* code_address, u32 code_size,
   2151                                        TickCount cycles_to_add, TickCount cycles_to_remove, u32 gpr_bitmask,
   2152                                        u8 address_register, u8 data_register, MemoryAccessSize size, bool is_signed,
   2153                                        bool is_load)
   2154 {
   2155   CodeGenerator acg(thunk_space, thunk_code);
   2156   CodeGenerator* cg = &acg;
   2157 
   2158   static constexpr u32 GPR_SIZE = 8;
   2159 
   2160   // save regs
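           // Only caller-saved registers the block still needs are spilled around the C call; for loads
           // the data register is skipped since it gets overwritten with the result anyway.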
   2161   u32 num_gprs = 0;
   2162 
   2163   for (u32 i = 0; i < NUM_HOST_REGS; i++)
   2164   {
   2165     if ((gpr_bitmask & (1u << i)) && IsCallerSavedRegister(i) && (!is_load || data_register != i))
   2166       num_gprs++;
   2167   }
   2168 
   2169   const u32 stack_size = (((num_gprs + 1) & ~1u) * GPR_SIZE) + STACK_SHADOW_SIZE;
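           // Rounding the saved-GPR count up to an even number keeps the adjustment a multiple of 16
           // bytes, so the stack alignment at the call into the C memory handler is preserved.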
   2170 
   2171   if (stack_size > 0)
   2172   {
   2173     cg->sub(cg->rsp, stack_size);
   2174 
   2175     u32 stack_offset = STACK_SHADOW_SIZE;
   2176     for (u32 i = 0; i < NUM_HOST_REGS; i++)
   2177     {
   2178       if ((gpr_bitmask & (1u << i)) && IsCallerSavedRegister(i) && (!is_load || data_register != i))
   2179       {
   2180         cg->mov(cg->qword[cg->rsp + stack_offset], Reg64(i));
   2181         stack_offset += GPR_SIZE;
   2182       }
   2183     }
   2184   }
   2185 
   2186   if (cycles_to_add != 0)
   2187     cg->add(cg->dword[PTR(&g_state.pending_ticks)], cycles_to_add);
   2188 
   2189   if (address_register != static_cast<u8>(RWARG1.getIdx()))
   2190     cg->mov(RWARG1, Reg32(address_register));
   2191 
   2192   if (!is_load)
   2193   {
   2194     if (data_register != static_cast<u8>(RWARG2.getIdx()))
   2195       cg->mov(RWARG2, Reg32(data_register));
   2196   }
   2197 
   2198   switch (size)
   2199   {
   2200     case MemoryAccessSize::Byte:
   2201     {
   2202       cg->call(is_load ? reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryByte) :
   2203                          reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryByte));
   2204     }
   2205     break;
   2206     case MemoryAccessSize::HalfWord:
   2207     {
   2208       cg->call(is_load ? reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryHalfWord) :
   2209                          reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryHalfWord));
   2210     }
   2211     break;
   2212     case MemoryAccessSize::Word:
   2213     {
   2214       cg->call(is_load ? reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryWord) :
   2215                          reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryWord));
   2216     }
   2217     break;
   2218   }
   2219 
   2220   if (is_load)
   2221   {
   2222     const Reg32 dst = Reg32(data_register);
   2223     switch (size)
   2224     {
   2225       case MemoryAccessSize::Byte:
   2226       {
   2227         is_signed ? cg->movsx(dst, RWRET.cvt8()) : cg->movzx(dst, RWRET.cvt8());
   2228       }
   2229       break;
   2230       case MemoryAccessSize::HalfWord:
   2231       {
   2232         is_signed ? cg->movsx(dst, RWRET.cvt16()) : cg->movzx(dst, RWRET.cvt16());
   2233       }
   2234       break;
   2235       case MemoryAccessSize::Word:
   2236       {
   2237         if (dst != RWRET)
   2238           cg->mov(dst, RWRET);
   2239       }
   2240       break;
   2241     }
   2242   }
   2243 
   2244   if (cycles_to_remove != 0)
   2245     cg->sub(cg->dword[PTR(&g_state.pending_ticks)], cycles_to_remove);
   2246 
   2247   // restore regs
   2248   if (stack_size > 0)
   2249   {
   2250     u32 stack_offset = STACK_SHADOW_SIZE;
   2251     for (u32 i = 0; i < NUM_HOST_REGS; i++)
   2252     {
   2253       if ((gpr_bitmask & (1u << i)) && IsCallerSavedRegister(i) && (!is_load || data_register != i))
   2254       {
   2255         cg->mov(Reg64(i), cg->qword[cg->rsp + stack_offset]);
   2256         stack_offset += GPR_SIZE;
   2257       }
   2258     }
   2259 
   2260     cg->add(cg->rsp, stack_size);
   2261   }
   2262 
   2263   cg->jmp(static_cast<const u8*>(code_address) + code_size);
   2264 
    2265   // fill the remainder of the original code site with nops, if any
   2266   DebugAssert(code_size >= BACKPATCH_JMP_SIZE);
   2267   if (code_size > BACKPATCH_JMP_SIZE)
   2268     std::memset(static_cast<u8*>(code_address) + BACKPATCH_JMP_SIZE, 0x90, code_size - BACKPATCH_JMP_SIZE);
   2269 
   2270   return static_cast<u32>(cg->getSize());
   2271 }
   2272 
   2273 #endif // CPU_ARCH_X64