duckstation

duckstation, but archived from the revision just before upstream changed it to a proprietary software project, this version is the libre one
git clone https://git.neptards.moe/u3shit/duckstation.git
Log | Files | Refs | README | LICENSE

cpu_newrec_compiler_riscv64.cpp (77586B)


      1 // SPDX-FileCopyrightText: 2024 Connor McLaughlin <stenzek@gmail.com>
      2 // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
      3 
      4 #include "cpu_newrec_compiler_riscv64.h"
      5 #include "cpu_code_cache_private.h"
      6 #include "cpu_core_private.h"
      7 #include "cpu_pgxp.h"
      8 #include "cpu_recompiler_thunks.h"
      9 #include "gte.h"
     10 #include "settings.h"
     11 #include "timing_event.h"
     12 
     13 #include "common/align.h"
     14 #include "common/assert.h"
     15 #include "common/log.h"
     16 #include "common/memmap.h"
     17 #include "common/string_util.h"
     18 
     19 #include <limits>
     20 
     21 #ifdef CPU_ARCH_RISCV64
     22 
     23 Log_SetChannel(CPU::NewRec);
     24 
     25 #ifdef ENABLE_HOST_DISASSEMBLY
     26 extern "C" {
     27 #include "riscv-disas.h"
     28 }
     29 #endif
     30 
     31 // For LW/SW/etc.
     32 #define PTR(x) ((u32)(((u8*)(x)) - ((u8*)&g_state))), RSTATE
     33 
     34 static constexpr u32 BLOCK_LINK_SIZE = 8; // auipc+jr
     35 
     36 namespace CPU::NewRec {
     37 
     38 using namespace biscuit;
     39 
     40 using CPU::Recompiler::rvEmitCall;
     41 using CPU::Recompiler::rvEmitDSExtW;
     42 using CPU::Recompiler::rvEmitDUExtW;
     43 using CPU::Recompiler::rvEmitFarLoad;
     44 using CPU::Recompiler::rvEmitJmp;
     45 using CPU::Recompiler::rvEmitMov;
     46 using CPU::Recompiler::rvEmitMov64;
     47 using CPU::Recompiler::rvEmitSExtB;
     48 using CPU::Recompiler::rvEmitSExtH;
     49 using CPU::Recompiler::rvEmitUExtB;
     50 using CPU::Recompiler::rvEmitUExtH;
     51 using CPU::Recompiler::rvGetAddressImmediates;
     52 using CPU::Recompiler::rvIsCallerSavedRegister;
     53 using CPU::Recompiler::rvIsValidSExtITypeImm;
     54 using CPU::Recompiler::rvMoveAddressToReg;
     55 
     56 RISCV64Compiler s_instance;
     57 Compiler* g_compiler = &s_instance;
     58 
     59 } // namespace CPU::NewRec
     60 
     61 bool CPU::Recompiler::rvIsCallerSavedRegister(u32 id)
     62 {
     63   return (id == 1 || (id >= 3 && id < 8) || (id >= 10 && id <= 17) || (id >= 28 && id <= 31));
     64 }
     65 
     66 bool CPU::Recompiler::rvIsValidSExtITypeImm(u32 imm)
     67 {
     68   return (static_cast<u32>((static_cast<s32>(imm) << 20) >> 20) == imm);
     69 }
     70 
     71 std::pair<s32, s32> CPU::Recompiler::rvGetAddressImmediates(const void* cur, const void* target)
     72 {
     73   const s64 disp = static_cast<s64>(reinterpret_cast<intptr_t>(target) - reinterpret_cast<intptr_t>(cur));
     74   Assert(disp >= static_cast<s64>(std::numeric_limits<s32>::min()) &&
     75          disp <= static_cast<s64>(std::numeric_limits<s32>::max()));
     76 
     77   const s64 hi = disp + 0x800;
     78   const s64 lo = disp - (hi & 0xFFFFF000);
     79   return std::make_pair(static_cast<s32>(hi >> 12), static_cast<s32>((lo << 52) >> 52));
     80 }
     81 
     82 void CPU::Recompiler::rvMoveAddressToReg(biscuit::Assembler* rvAsm, const biscuit::GPR& reg, const void* addr)
     83 {
     84   const auto [hi, lo] = rvGetAddressImmediates(rvAsm->GetCursorPointer(), addr);
     85   rvAsm->AUIPC(reg, hi);
     86   rvAsm->ADDI(reg, reg, lo);
     87 }
     88 
     89 void CPU::Recompiler::rvEmitMov(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, u32 imm)
     90 {
     91   // Borrowed from biscuit, but doesn't emit an ADDI if the lower 12 bits are zero.
     92   const u32 lower = imm & 0xFFF;
     93   const u32 upper = (imm & 0xFFFFF000) >> 12;
     94   const s32 simm = static_cast<s32>(imm);
     95   if (rvIsValidSExtITypeImm(simm))
     96   {
     97     rvAsm->ADDI(rd, biscuit::zero, static_cast<s32>(lower));
     98   }
     99   else
    100   {
    101     const bool needs_increment = (lower & 0x800) != 0;
    102     const u32 upper_imm = needs_increment ? upper + 1 : upper;
    103     rvAsm->LUI(rd, upper_imm);
    104     rvAsm->ADDI(rd, rd, static_cast<int32_t>(lower));
    105   }
    106 }
    107 
    108 void CPU::Recompiler::rvEmitMov64(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& scratch,
    109                                   u64 imm)
    110 {
    111   // TODO: Make better..
    112   rvEmitMov(rvAsm, rd, static_cast<u32>(imm >> 32));
    113   rvEmitMov(rvAsm, scratch, static_cast<u32>(imm));
    114   rvAsm->SLLI64(rd, rd, 32);
    115   rvAsm->SLLI64(scratch, scratch, 32);
    116   rvAsm->SRLI64(scratch, scratch, 32);
    117   rvAsm->ADD(rd, rd, scratch);
    118 }
    119 
    120 u32 CPU::Recompiler::rvEmitJmp(biscuit::Assembler* rvAsm, const void* ptr, const biscuit::GPR& link_reg)
    121 {
    122   // TODO: use J if displacement is <1MB,  needs a bool because backpatch must be 8 bytes
    123   const auto [hi, lo] = rvGetAddressImmediates(rvAsm->GetCursorPointer(), ptr);
    124   rvAsm->AUIPC(RSCRATCH, hi);
    125   rvAsm->JALR(link_reg, lo, RSCRATCH);
    126   return 8;
    127 }
    128 
    129 u32 CPU::Recompiler::rvEmitCall(biscuit::Assembler* rvAsm, const void* ptr)
    130 {
    131   return rvEmitJmp(rvAsm, ptr, biscuit::ra);
    132 }
    133 
    134 void CPU::Recompiler::rvEmitFarLoad(biscuit::Assembler* rvAsm, const biscuit::GPR& reg, const void* addr,
    135                                     bool sign_extend_word)
    136 {
    137   const auto [hi, lo] = rvGetAddressImmediates(rvAsm->GetCursorPointer(), addr);
    138   rvAsm->AUIPC(reg, hi);
    139   if (sign_extend_word)
    140     rvAsm->LW(reg, lo, reg);
    141   else
    142     rvAsm->LWU(reg, lo, reg);
    143 }
    144 
    145 void CPU::Recompiler::rvEmitFarStore(biscuit::Assembler* rvAsm, const biscuit::GPR& reg, const void* addr,
    146                                      const biscuit::GPR& tempreg)
    147 {
    148   const auto [hi, lo] = rvGetAddressImmediates(rvAsm->GetCursorPointer(), addr);
    149   rvAsm->AUIPC(tempreg, hi);
    150   rvAsm->SW(reg, lo, tempreg);
    151 }
    152 
    153 void CPU::Recompiler::rvEmitSExtB(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs)
    154 {
    155   rvAsm->SLLI(rd, rs, 24);
    156   rvAsm->SRAIW(rd, rd, 24);
    157 }
    158 
    159 void CPU::Recompiler::rvEmitUExtB(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs)
    160 {
    161   rvAsm->ANDI(rd, rs, 0xFF);
    162 }
    163 
    164 void CPU::Recompiler::rvEmitSExtH(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs)
    165 {
    166   rvAsm->SLLI(rd, rs, 16);
    167   rvAsm->SRAIW(rd, rd, 16);
    168 }
    169 
    170 void CPU::Recompiler::rvEmitUExtH(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs)
    171 {
    172   rvAsm->SLLI(rd, rs, 16);
    173   rvAsm->SRLI(rd, rd, 16);
    174 }
    175 
    176 void CPU::Recompiler::rvEmitDSExtW(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs)
    177 {
    178   rvAsm->ADDIW(rd, rs, 0);
    179 }
    180 
    181 void CPU::Recompiler::rvEmitDUExtW(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs)
    182 {
    183   rvAsm->SLLI64(rd, rs, 32);
    184   rvAsm->SRLI64(rd, rd, 32);
    185 }
    186 
    187 void CPU::CodeCache::DisassembleAndLogHostCode(const void* start, u32 size)
    188 {
    189 #ifdef ENABLE_HOST_DISASSEMBLY
    190   const u8* cur = static_cast<const u8*>(start);
    191   const u8* end = cur + size;
    192   char buf[256];
    193   while (cur < end)
    194   {
    195     rv_inst inst;
    196     size_t instlen;
    197     inst_fetch(cur, &inst, &instlen);
    198     disasm_inst(buf, std::size(buf), rv64, static_cast<u64>(reinterpret_cast<uintptr_t>(cur)), inst);
    199     DEBUG_LOG("\t0x{:016X}\t{}", static_cast<u64>(reinterpret_cast<uintptr_t>(cur)), buf);
    200     cur += instlen;
    201   }
    202 #else
    203   ERROR_LOG("Not compiled with ENABLE_HOST_DISASSEMBLY.");
    204 #endif
    205 }
    206 
    207 u32 CPU::CodeCache::GetHostInstructionCount(const void* start, u32 size)
    208 {
    209 #ifdef ENABLE_HOST_DISASSEMBLY
    210   const u8* cur = static_cast<const u8*>(start);
    211   const u8* end = cur + size;
    212   u32 icount = 0;
    213   while (cur < end)
    214   {
    215     rv_inst inst;
    216     size_t instlen;
    217     inst_fetch(cur, &inst, &instlen);
    218     cur += instlen;
    219     icount++;
    220   }
    221   return icount;
    222 #else
    223   ERROR_LOG("Not compiled with ENABLE_HOST_DISASSEMBLY.");
    224   return 0;
    225 #endif
    226 }
    227 
    228 u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
    229 {
    230   using namespace CPU::Recompiler;
    231   using namespace biscuit;
    232 
    233   Assembler actual_asm(static_cast<u8*>(code), code_size);
    234   Assembler* rvAsm = &actual_asm;
    235 
    236   Label dispatch;
    237 
    238   g_enter_recompiler = reinterpret_cast<decltype(g_enter_recompiler)>(rvAsm->GetCursorPointer());
    239   {
    240     // TODO: reserve some space for saving caller-saved registers
    241 
    242     // Need the CPU state for basically everything :-)
    243     rvMoveAddressToReg(rvAsm, RSTATE, &g_state);
    244 
    245     // Fastmem setup
    246     if (IsUsingFastmem())
    247       rvAsm->LD(RMEMBASE, PTR(&g_state.fastmem_base));
    248 
    249     // Downcount isn't set on entry, so we need to initialize it
    250     rvMoveAddressToReg(rvAsm, RARG1, TimingEvents::GetHeadEventPtr());
    251     rvAsm->LD(RARG1, 0, RARG1);
    252     rvAsm->LW(RARG1, OFFSETOF(TimingEvent, m_downcount), RARG1);
    253     rvAsm->SW(RARG1, PTR(&g_state.downcount));
    254 
    255     // Fall through to event dispatcher
    256   }
    257 
    258   // check events then for frame done
    259   g_check_events_and_dispatch = rvAsm->GetCursorPointer();
    260   {
    261     Label skip_event_check;
    262     rvAsm->LW(RARG1, PTR(&g_state.pending_ticks));
    263     rvAsm->LW(RARG2, PTR(&g_state.downcount));
    264     rvAsm->BLTU(RARG1, RARG2, &skip_event_check);
    265 
    266     g_run_events_and_dispatch = rvAsm->GetCursorPointer();
    267     rvEmitCall(rvAsm, reinterpret_cast<const void*>(&TimingEvents::RunEvents));
    268 
    269     rvAsm->Bind(&skip_event_check);
    270   }
    271 
    272   // TODO: align?
    273   g_dispatcher = rvAsm->GetCursorPointer();
    274   {
    275     rvAsm->Bind(&dispatch);
    276 
    277     // x9 <- s_fast_map[pc >> 16]
    278     rvAsm->LWU(RARG1, PTR(&g_state.pc));
    279     rvMoveAddressToReg(rvAsm, RARG3, g_code_lut.data());
    280     rvAsm->SRLI(RARG2, RARG1, 16);
    281     rvAsm->SLLI(RARG1, RARG1, 1);
    282     rvAsm->SLLI(RARG2, RARG2, 3);
    283     rvAsm->ADD(RARG2, RARG2, RARG3);
    284     rvAsm->LD(RARG2, 0, RARG2);
    285 
    286     // blr(x9[pc * 2]) (fast_map[pc >> 2])
    287     rvAsm->ADD(RARG1, RARG1, RARG2);
    288     rvAsm->LD(RARG1, 0, RARG1);
    289     rvAsm->JR(RARG1);
    290   }
    291 
    292   g_compile_or_revalidate_block = rvAsm->GetCursorPointer();
    293   {
    294     rvAsm->LW(RARG1, PTR(&g_state.pc));
    295     rvEmitCall(rvAsm, reinterpret_cast<const void*>(&CompileOrRevalidateBlock));
    296     rvAsm->J(&dispatch);
    297   }
    298 
    299   g_discard_and_recompile_block = rvAsm->GetCursorPointer();
    300   {
    301     rvAsm->LW(RARG1, PTR(&g_state.pc));
    302     rvEmitCall(rvAsm, reinterpret_cast<const void*>(&DiscardAndRecompileBlock));
    303     rvAsm->J(&dispatch);
    304   }
    305 
    306   g_interpret_block = rvAsm->GetCursorPointer();
    307   {
    308     rvEmitCall(rvAsm, CodeCache::GetInterpretUncachedBlockFunction());
    309     rvAsm->J(&dispatch);
    310   }
    311 
    312   // TODO: align?
    313 
    314   return static_cast<u32>(rvAsm->GetCodeBuffer().GetSizeInBytes());
    315 }
    316 
    317 u32 CPU::CodeCache::EmitJump(void* code, const void* dst, bool flush_icache)
    318 {
    319   // TODO: get rid of assembler construction here
    320   {
    321     biscuit::Assembler assembler(static_cast<u8*>(code), BLOCK_LINK_SIZE);
    322     CPU::Recompiler::rvEmitCall(&assembler, dst);
    323 
    324     DebugAssert(assembler.GetCodeBuffer().GetSizeInBytes() <= BLOCK_LINK_SIZE);
    325     if (assembler.GetCodeBuffer().GetRemainingBytes() > 0)
    326       assembler.NOP();
    327   }
    328 
    329   if (flush_icache)
    330     MemMap::FlushInstructionCache(code, BLOCK_LINK_SIZE);
    331 
    332   return BLOCK_LINK_SIZE;
    333 }
    334 
    335 CPU::NewRec::RISCV64Compiler::RISCV64Compiler() = default;
    336 
    337 CPU::NewRec::RISCV64Compiler::~RISCV64Compiler() = default;
    338 
    339 const void* CPU::NewRec::RISCV64Compiler::GetCurrentCodePointer()
    340 {
    341   return rvAsm->GetCursorPointer();
    342 }
    343 
    344 void CPU::NewRec::RISCV64Compiler::Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space,
    345                                          u8* far_code_buffer, u32 far_code_space)
    346 {
    347   Compiler::Reset(block, code_buffer, code_buffer_space, far_code_buffer, far_code_space);
    348 
    349   // TODO: don't recreate this every time..
    350   DebugAssert(!m_emitter && !m_far_emitter && !rvAsm);
    351   m_emitter = std::make_unique<Assembler>(code_buffer, code_buffer_space);
    352   m_far_emitter = std::make_unique<Assembler>(far_code_buffer, far_code_space);
    353   rvAsm = m_emitter.get();
    354 
    355   // Need to wipe it out so it's correct when toggling fastmem.
    356   m_host_regs = {};
    357 
    358   const u32 membase_idx = CodeCache::IsUsingFastmem() ? RMEMBASE.Index() : NUM_HOST_REGS;
    359   for (u32 i = 0; i < NUM_HOST_REGS; i++)
    360   {
    361     HostRegAlloc& hra = m_host_regs[i];
    362 
    363     if (i == RARG1.Index() || i == RARG2.Index() || i == RARG3.Index() || i == RSCRATCH.Index() ||
    364         i == RSTATE.Index() || i == membase_idx || i < 5 /* zero, ra, sp, gp, tp */)
    365     {
    366       continue;
    367     }
    368 
    369     hra.flags = HR_USABLE | (rvIsCallerSavedRegister(i) ? 0 : HR_CALLEE_SAVED);
    370   }
    371 }
    372 
    373 void CPU::NewRec::RISCV64Compiler::SwitchToFarCode(
    374   bool emit_jump,
    375   void (biscuit::Assembler::*inverted_cond)(biscuit::GPR, biscuit::GPR, biscuit::Label*) /* = nullptr */,
    376   const biscuit::GPR& rs1 /* = biscuit::zero */, const biscuit::GPR& rs2 /* = biscuit::zero */)
    377 {
    378   DebugAssert(rvAsm == m_emitter.get());
    379   if (emit_jump)
    380   {
    381     const void* target = m_far_emitter->GetCursorPointer();
    382     if (inverted_cond)
    383     {
    384       Label skip;
    385       (rvAsm->*inverted_cond)(rs1, rs2, &skip);
    386       rvEmitJmp(rvAsm, target);
    387       rvAsm->Bind(&skip);
    388     }
    389     else
    390     {
    391       rvEmitCall(rvAsm, target);
    392     }
    393   }
    394   rvAsm = m_far_emitter.get();
    395 }
    396 
    397 void CPU::NewRec::RISCV64Compiler::SwitchToNearCode(bool emit_jump)
    398 {
    399   DebugAssert(rvAsm == m_far_emitter.get());
    400   if (emit_jump)
    401     rvEmitJmp(rvAsm, m_emitter->GetCursorPointer());
    402   rvAsm = m_emitter.get();
    403 }
    404 
    405 void CPU::NewRec::RISCV64Compiler::EmitMov(const biscuit::GPR& dst, u32 val)
    406 {
    407   rvEmitMov(rvAsm, dst, val);
    408 }
    409 
    410 void CPU::NewRec::RISCV64Compiler::EmitCall(const void* ptr)
    411 {
    412   rvEmitCall(rvAsm, ptr);
    413 }
    414 
    415 void CPU::NewRec::RISCV64Compiler::SafeImmSExtIType(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm,
    416                                                     void (biscuit::Assembler::*iop)(GPR, GPR, u32),
    417                                                     void (biscuit::Assembler::*rop)(GPR, GPR, GPR))
    418 {
    419   DebugAssert(rd != RSCRATCH && rs != RSCRATCH);
    420 
    421   if (rvIsValidSExtITypeImm(imm))
    422   {
    423     (rvAsm->*iop)(rd, rs, imm);
    424     return;
    425   }
    426 
    427   rvEmitMov(rvAsm, RSCRATCH, imm);
    428   (rvAsm->*rop)(rd, rs, RSCRATCH);
    429 }
    430 
    431 void CPU::NewRec::RISCV64Compiler::SafeADDI(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm)
    432 {
    433   SafeImmSExtIType(rd, rs, imm, reinterpret_cast<void (biscuit::Assembler::*)(GPR, GPR, u32)>(&Assembler::ADDI),
    434                    &Assembler::ADD);
    435 }
    436 
    437 void CPU::NewRec::RISCV64Compiler::SafeADDIW(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm)
    438 {
    439   SafeImmSExtIType(rd, rs, imm, reinterpret_cast<void (biscuit::Assembler::*)(GPR, GPR, u32)>(&Assembler::ADDIW),
    440                    &Assembler::ADDW);
    441 }
    442 
    443 void CPU::NewRec::RISCV64Compiler::SafeSUBIW(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm)
    444 {
    445   const u32 nimm = static_cast<u32>(-static_cast<s32>(imm));
    446   SafeImmSExtIType(rd, rs, nimm, reinterpret_cast<void (biscuit::Assembler::*)(GPR, GPR, u32)>(&Assembler::ADDIW),
    447                    &Assembler::ADDW);
    448 }
    449 
    450 void CPU::NewRec::RISCV64Compiler::SafeANDI(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm)
    451 {
    452   SafeImmSExtIType(rd, rs, imm, &Assembler::ANDI, &Assembler::AND);
    453 }
    454 
    455 void CPU::NewRec::RISCV64Compiler::SafeORI(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm)
    456 {
    457   SafeImmSExtIType(rd, rs, imm, &Assembler::ORI, &Assembler::OR);
    458 }
    459 
    460 void CPU::NewRec::RISCV64Compiler::SafeXORI(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm)
    461 {
    462   SafeImmSExtIType(rd, rs, imm, &Assembler::XORI, &Assembler::XOR);
    463 }
    464 
    465 void CPU::NewRec::RISCV64Compiler::SafeSLTI(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm)
    466 {
    467   SafeImmSExtIType(rd, rs, imm, reinterpret_cast<void (biscuit::Assembler::*)(GPR, GPR, u32)>(&Assembler::SLTI),
    468                    &Assembler::SLT);
    469 }
    470 
    471 void CPU::NewRec::RISCV64Compiler::SafeSLTIU(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm)
    472 {
    473   SafeImmSExtIType(rd, rs, imm, reinterpret_cast<void (biscuit::Assembler::*)(GPR, GPR, u32)>(&Assembler::SLTIU),
    474                    &Assembler::SLTU);
    475 }
    476 
    477 void CPU::NewRec::RISCV64Compiler::EmitSExtB(const biscuit::GPR& rd, const biscuit::GPR& rs)
    478 {
    479   rvEmitSExtB(rvAsm, rd, rs);
    480 }
    481 
    482 void CPU::NewRec::RISCV64Compiler::EmitUExtB(const biscuit::GPR& rd, const biscuit::GPR& rs)
    483 {
    484   rvEmitUExtB(rvAsm, rd, rs);
    485 }
    486 
    487 void CPU::NewRec::RISCV64Compiler::EmitSExtH(const biscuit::GPR& rd, const biscuit::GPR& rs)
    488 {
    489   rvEmitSExtH(rvAsm, rd, rs);
    490 }
    491 
    492 void CPU::NewRec::RISCV64Compiler::EmitUExtH(const biscuit::GPR& rd, const biscuit::GPR& rs)
    493 {
    494   rvEmitUExtH(rvAsm, rd, rs);
    495 }
    496 
    497 void CPU::NewRec::RISCV64Compiler::EmitDSExtW(const biscuit::GPR& rd, const biscuit::GPR& rs)
    498 {
    499   rvEmitDSExtW(rvAsm, rd, rs);
    500 }
    501 
    502 void CPU::NewRec::RISCV64Compiler::EmitDUExtW(const biscuit::GPR& rd, const biscuit::GPR& rs)
    503 {
    504   rvEmitDUExtW(rvAsm, rd, rs);
    505 }
    506 
    507 void CPU::NewRec::RISCV64Compiler::GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size)
    508 {
    509   // store it first to reduce code size, because we can offset
    510   // TODO: 64-bit displacement is needed :/
    511   // rvMoveAddressToReg(rvAsm, RARG1, ram_ptr);
    512   // rvMoveAddressToReg(rvAsm, RARG2, shadow_ptr);
    513   rvEmitMov64(rvAsm, RARG1, RSCRATCH, static_cast<u64>(reinterpret_cast<uintptr_t>(ram_ptr)));
    514   rvEmitMov64(rvAsm, RARG2, RSCRATCH, static_cast<u64>(reinterpret_cast<uintptr_t>(shadow_ptr)));
    515 
    516   u32 offset = 0;
    517   Label block_changed;
    518 
    519   while (size >= 8)
    520   {
    521     rvAsm->LD(RARG3, offset, RARG1);
    522     rvAsm->LD(RSCRATCH, offset, RARG2);
    523     rvAsm->BNE(RARG3, RSCRATCH, &block_changed);
    524     offset += 8;
    525     size -= 8;
    526   }
    527 
    528   while (size >= 4)
    529   {
    530     rvAsm->LWU(RARG3, offset, RARG1);
    531     rvAsm->LWU(RSCRATCH, offset, RARG2);
    532     rvAsm->BNE(RARG3, RSCRATCH, &block_changed);
    533     offset += 4;
    534     size -= 4;
    535   }
    536 
    537   DebugAssert(size == 0);
    538 
    539   Label block_unchanged;
    540   rvAsm->J(&block_unchanged);
    541   rvAsm->Bind(&block_changed);
    542   rvEmitJmp(rvAsm, CodeCache::g_discard_and_recompile_block);
    543   rvAsm->Bind(&block_unchanged);
    544 }
    545 
    546 void CPU::NewRec::RISCV64Compiler::GenerateICacheCheckAndUpdate()
    547 {
    548   if (!m_block->HasFlag(CodeCache::BlockFlags::IsUsingICache))
    549   {
    550     if (m_block->HasFlag(CodeCache::BlockFlags::NeedsDynamicFetchTicks))
    551     {
    552       rvEmitFarLoad(rvAsm, RARG2, GetFetchMemoryAccessTimePtr());
    553       rvAsm->LW(RARG1, PTR(&g_state.pending_ticks));
    554       rvEmitMov(rvAsm, RARG3, m_block->size);
    555       rvAsm->MULW(RARG2, RARG2, RARG3);
    556       rvAsm->ADD(RARG1, RARG1, RARG2);
    557       rvAsm->SW(RARG1, PTR(&g_state.pending_ticks));
    558     }
    559     else
    560     {
    561       rvAsm->LW(RARG1, PTR(&g_state.pending_ticks));
    562       SafeADDIW(RARG1, RARG1, static_cast<u32>(m_block->uncached_fetch_ticks));
    563       rvAsm->SW(RARG1, PTR(&g_state.pending_ticks));
    564     }
    565   }
    566   else if (m_block->icache_line_count > 0)
    567   {
    568     const auto& ticks_reg = RARG1;
    569     const auto& current_tag_reg = RARG2;
    570     const auto& existing_tag_reg = RARG3;
    571 
    572     VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK;
    573     rvAsm->LW(ticks_reg, PTR(&g_state.pending_ticks));
    574     rvEmitMov(rvAsm, current_tag_reg, current_pc);
    575 
    576     for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE)
    577     {
    578       const TickCount fill_ticks = GetICacheFillTicks(current_pc);
    579       if (fill_ticks <= 0)
    580         continue;
    581 
    582       const u32 line = GetICacheLine(current_pc);
    583       const u32 offset = OFFSETOF(State, icache_tags) + (line * sizeof(u32));
    584 
    585       // TODO: Verify sign extension here...
    586       Label cache_hit;
    587       rvAsm->LW(existing_tag_reg, offset, RSTATE);
    588       rvAsm->BEQ(existing_tag_reg, current_tag_reg, &cache_hit);
    589 
    590       rvAsm->SW(current_tag_reg, offset, RSTATE);
    591       SafeADDIW(ticks_reg, ticks_reg, static_cast<u32>(fill_ticks));
    592       rvAsm->Bind(&cache_hit);
    593 
    594       if (i != (m_block->icache_line_count - 1))
    595         SafeADDIW(current_tag_reg, current_tag_reg, ICACHE_LINE_SIZE);
    596     }
    597 
    598     rvAsm->SW(ticks_reg, PTR(&g_state.pending_ticks));
    599   }
    600 }
    601 
    602 void CPU::NewRec::RISCV64Compiler::GenerateCall(const void* func, s32 arg1reg /*= -1*/, s32 arg2reg /*= -1*/,
    603                                                 s32 arg3reg /*= -1*/)
    604 {
    605   if (arg1reg >= 0 && arg1reg != static_cast<s32>(RARG1.Index()))
    606     rvAsm->MV(RARG1, GPR(arg1reg));
    607   if (arg2reg >= 0 && arg2reg != static_cast<s32>(RARG2.Index()))
    608     rvAsm->MV(RARG2, GPR(arg2reg));
    609   if (arg3reg >= 0 && arg3reg != static_cast<s32>(RARG3.Index()))
    610     rvAsm->MV(RARG3, GPR(arg3reg));
    611   EmitCall(func);
    612 }
    613 
    614 void CPU::NewRec::RISCV64Compiler::EndBlock(const std::optional<u32>& newpc, bool do_event_test)
    615 {
    616   if (newpc.has_value())
    617   {
    618     if (m_dirty_pc || m_compiler_pc != newpc)
    619     {
    620       EmitMov(RSCRATCH, newpc.value());
    621       rvAsm->SW(RSCRATCH, PTR(&g_state.pc));
    622     }
    623   }
    624   m_dirty_pc = false;
    625 
    626   // flush regs
    627   Flush(FLUSH_END_BLOCK);
    628   EndAndLinkBlock(newpc, do_event_test, false);
    629 }
    630 
    631 void CPU::NewRec::RISCV64Compiler::EndBlockWithException(Exception excode)
    632 {
    633   // flush regs, but not pc, it's going to get overwritten
    634   // flush cycles because of the GTE instruction stuff...
    635   Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);
    636 
    637   // TODO: flush load delay
    638   // TODO: break for pcdrv
    639 
    640   EmitMov(RARG1, Cop0Registers::CAUSE::MakeValueForException(excode, m_current_instruction_branch_delay_slot, false,
    641                                                              inst->cop.cop_n));
    642   EmitMov(RARG2, m_current_instruction_pc);
    643   EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
    644   m_dirty_pc = false;
    645 
    646   EndAndLinkBlock(std::nullopt, true, false);
    647 }
    648 
    649 void CPU::NewRec::RISCV64Compiler::EndAndLinkBlock(const std::optional<u32>& newpc, bool do_event_test,
    650                                                    bool force_run_events)
    651 {
    652   // event test
    653   // pc should've been flushed
    654   DebugAssert(!m_dirty_pc && !m_block_ended);
    655   m_block_ended = true;
    656 
    657   // TODO: try extracting this to a function
    658   // TODO: move the cycle flush in here..
    659 
    660   // save cycles for event test
    661   const TickCount cycles = std::exchange(m_cycles, 0);
    662 
    663   // pending_ticks += cycles
    664   // if (pending_ticks >= downcount) { dispatch_event(); }
    665   if (do_event_test || m_gte_done_cycle > cycles || cycles > 0)
    666     rvAsm->LW(RARG1, PTR(&g_state.pending_ticks));
    667   if (do_event_test)
    668     rvAsm->LW(RARG2, PTR(&g_state.downcount));
    669   if (cycles > 0)
    670   {
    671     SafeADDIW(RARG1, RARG1, cycles);
    672     rvAsm->SW(RARG1, PTR(&g_state.pending_ticks));
    673   }
    674   if (m_gte_done_cycle > cycles)
    675   {
    676     SafeADDIW(RARG2, RARG1, m_gte_done_cycle - cycles);
    677     rvAsm->SW(RARG1, PTR(&g_state.gte_completion_tick));
    678   }
    679 
    680   if (do_event_test)
    681   {
    682     // TODO: see if we can do a far jump somehow with this..
    683     Label cont;
    684     rvAsm->BLT(RARG1, RARG2, &cont);
    685     rvEmitJmp(rvAsm, CodeCache::g_run_events_and_dispatch);
    686     rvAsm->Bind(&cont);
    687   }
    688 
    689   // jump to dispatcher or next block
    690   if (force_run_events)
    691   {
    692     rvEmitJmp(rvAsm, CodeCache::g_run_events_and_dispatch);
    693   }
    694   else if (!newpc.has_value())
    695   {
    696     rvEmitJmp(rvAsm, CodeCache::g_dispatcher);
    697   }
    698   else
    699   {
    700     if (newpc.value() == m_block->pc)
    701     {
    702       // Special case: ourselves! No need to backlink then.
    703       DEBUG_LOG("Linking block at {:08X} to self", m_block->pc);
    704       rvEmitJmp(rvAsm, rvAsm->GetBufferPointer(0));
    705     }
    706     else
    707     {
    708       const void* target = CreateBlockLink(m_block, rvAsm->GetCursorPointer(), newpc.value());
    709       rvEmitJmp(rvAsm, target);
    710     }
    711   }
    712 }
    713 
    714 const void* CPU::NewRec::RISCV64Compiler::EndCompile(u32* code_size, u32* far_code_size)
    715 {
    716   u8* const code = m_emitter->GetBufferPointer(0);
    717   *code_size = static_cast<u32>(m_emitter->GetCodeBuffer().GetSizeInBytes());
    718   *far_code_size = static_cast<u32>(m_far_emitter->GetCodeBuffer().GetSizeInBytes());
    719   rvAsm = nullptr;
    720   m_far_emitter.reset();
    721   m_emitter.reset();
    722   return code;
    723 }
    724 
    725 const char* CPU::NewRec::RISCV64Compiler::GetHostRegName(u32 reg) const
    726 {
    727   static constexpr std::array<const char*, 32> reg64_names = {
    728     {"zero", "ra", "sp", "gp", "tp", "t0", "t1", "t2", "s0", "s1", "a0",  "a1",  "a2", "a3", "a4", "a5",
    729      "a6",   "a7", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10", "s11", "t3", "t4", "t5", "t6"}};
    730   return (reg < reg64_names.size()) ? reg64_names[reg] : "UNKNOWN";
    731 }
    732 
    733 void CPU::NewRec::RISCV64Compiler::LoadHostRegWithConstant(u32 reg, u32 val)
    734 {
    735   EmitMov(GPR(reg), val);
    736 }
    737 
    738 void CPU::NewRec::RISCV64Compiler::LoadHostRegFromCPUPointer(u32 reg, const void* ptr)
    739 {
    740   rvAsm->LW(GPR(reg), PTR(ptr));
    741 }
    742 
    743 void CPU::NewRec::RISCV64Compiler::StoreHostRegToCPUPointer(u32 reg, const void* ptr)
    744 {
    745   rvAsm->SW(GPR(reg), PTR(ptr));
    746 }
    747 
    748 void CPU::NewRec::RISCV64Compiler::StoreConstantToCPUPointer(u32 val, const void* ptr)
    749 {
    750   if (val == 0)
    751   {
    752     rvAsm->SW(zero, PTR(ptr));
    753     return;
    754   }
    755 
    756   EmitMov(RSCRATCH, val);
    757   rvAsm->SW(RSCRATCH, PTR(ptr));
    758 }
    759 
    760 void CPU::NewRec::RISCV64Compiler::CopyHostReg(u32 dst, u32 src)
    761 {
    762   if (src != dst)
    763     rvAsm->MV(GPR(dst), GPR(src));
    764 }
    765 
    766 void CPU::NewRec::RISCV64Compiler::AssertRegOrConstS(CompileFlags cf) const
    767 {
    768   DebugAssert(cf.valid_host_s || cf.const_s);
    769 }
    770 
    771 void CPU::NewRec::RISCV64Compiler::AssertRegOrConstT(CompileFlags cf) const
    772 {
    773   DebugAssert(cf.valid_host_t || cf.const_t);
    774 }
    775 
    776 biscuit::GPR CPU::NewRec::RISCV64Compiler::CFGetSafeRegS(CompileFlags cf, const biscuit::GPR& temp_reg)
    777 {
    778   if (cf.valid_host_s)
    779   {
    780     return GPR(cf.host_s);
    781   }
    782   else if (cf.const_s)
    783   {
    784     if (HasConstantRegValue(cf.MipsS(), 0))
    785       return zero;
    786 
    787     EmitMov(temp_reg, GetConstantRegU32(cf.MipsS()));
    788     return temp_reg;
    789   }
    790   else
    791   {
    792     WARNING_LOG("Hit memory path in CFGetSafeRegS() for {}", GetRegName(cf.MipsS()));
    793     rvAsm->LW(temp_reg, PTR(&g_state.regs.r[cf.mips_s]));
    794     return temp_reg;
    795   }
    796 }
    797 
    798 biscuit::GPR CPU::NewRec::RISCV64Compiler::CFGetSafeRegT(CompileFlags cf, const biscuit::GPR& temp_reg)
    799 {
    800   if (cf.valid_host_t)
    801   {
    802     return GPR(cf.host_t);
    803   }
    804   else if (cf.const_t)
    805   {
    806     if (HasConstantRegValue(cf.MipsT(), 0))
    807       return zero;
    808 
    809     EmitMov(temp_reg, GetConstantRegU32(cf.MipsT()));
    810     return temp_reg;
    811   }
    812   else
    813   {
    814     WARNING_LOG("Hit memory path in CFGetSafeRegT() for {}", GetRegName(cf.MipsT()));
    815     rvAsm->LW(temp_reg, PTR(&g_state.regs.r[cf.mips_t]));
    816     return temp_reg;
    817   }
    818 }
    819 
    820 biscuit::GPR CPU::NewRec::RISCV64Compiler::CFGetRegD(CompileFlags cf) const
    821 {
    822   DebugAssert(cf.valid_host_d);
    823   return GPR(cf.host_d);
    824 }
    825 
    826 biscuit::GPR CPU::NewRec::RISCV64Compiler::CFGetRegS(CompileFlags cf) const
    827 {
    828   DebugAssert(cf.valid_host_s);
    829   return GPR(cf.host_s);
    830 }
    831 
    832 biscuit::GPR CPU::NewRec::RISCV64Compiler::CFGetRegT(CompileFlags cf) const
    833 {
    834   DebugAssert(cf.valid_host_t);
    835   return GPR(cf.host_t);
    836 }
    837 
    838 biscuit::GPR CPU::NewRec::RISCV64Compiler::CFGetRegLO(CompileFlags cf) const
    839 {
    840   DebugAssert(cf.valid_host_lo);
    841   return GPR(cf.host_lo);
    842 }
    843 
    844 biscuit::GPR CPU::NewRec::RISCV64Compiler::CFGetRegHI(CompileFlags cf) const
    845 {
    846   DebugAssert(cf.valid_host_hi);
    847   return GPR(cf.host_hi);
    848 }
    849 
    850 void CPU::NewRec::RISCV64Compiler::MoveSToReg(const biscuit::GPR& dst, CompileFlags cf)
    851 {
    852   if (cf.valid_host_s)
    853   {
    854     if (cf.host_s != dst.Index())
    855       rvAsm->MV(dst, GPR(cf.host_s));
    856   }
    857   else if (cf.const_s)
    858   {
    859     EmitMov(dst, GetConstantRegU32(cf.MipsS()));
    860   }
    861   else
    862   {
    863     WARNING_LOG("Hit memory path in MoveSToReg() for {}", GetRegName(cf.MipsS()));
    864     rvAsm->LW(dst, PTR(&g_state.regs.r[cf.mips_s]));
    865   }
    866 }
    867 
    868 void CPU::NewRec::RISCV64Compiler::MoveTToReg(const biscuit::GPR& dst, CompileFlags cf)
    869 {
    870   if (cf.valid_host_t)
    871   {
    872     if (cf.host_t != dst.Index())
    873       rvAsm->MV(dst, GPR(cf.host_t));
    874   }
    875   else if (cf.const_t)
    876   {
    877     EmitMov(dst, GetConstantRegU32(cf.MipsT()));
    878   }
    879   else
    880   {
    881     WARNING_LOG("Hit memory path in MoveTToReg() for {}", GetRegName(cf.MipsT()));
    882     rvAsm->LW(dst, PTR(&g_state.regs.r[cf.mips_t]));
    883   }
    884 }
    885 
    886 void CPU::NewRec::RISCV64Compiler::MoveMIPSRegToReg(const biscuit::GPR& dst, Reg reg)
    887 {
    888   DebugAssert(reg < Reg::count);
    889   if (const std::optional<u32> hreg = CheckHostReg(0, Compiler::HR_TYPE_CPU_REG, reg))
    890     rvAsm->MV(dst, GPR(hreg.value()));
    891   else if (HasConstantReg(reg))
    892     EmitMov(dst, GetConstantRegU32(reg));
    893   else
    894     rvAsm->LW(dst, PTR(&g_state.regs.r[static_cast<u8>(reg)]));
    895 }
    896 
    897 void CPU::NewRec::RISCV64Compiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val,
    898                                                                 Reg arg2reg /* = Reg::count */,
    899                                                                 Reg arg3reg /* = Reg::count */)
    900 {
    901   DebugAssert(g_settings.gpu_pgxp_enable);
    902 
    903   Flush(FLUSH_FOR_C_CALL);
    904 
    905   if (arg2reg != Reg::count)
    906     MoveMIPSRegToReg(RARG2, arg2reg);
    907   if (arg3reg != Reg::count)
    908     MoveMIPSRegToReg(RARG3, arg3reg);
    909 
    910   EmitMov(RARG1, arg1val);
    911   EmitCall(func);
    912 }
    913 
    914 void CPU::NewRec::RISCV64Compiler::Flush(u32 flags)
    915 {
    916   Compiler::Flush(flags);
    917 
    918   if (flags & FLUSH_PC && m_dirty_pc)
    919   {
    920     StoreConstantToCPUPointer(m_compiler_pc, &g_state.pc);
    921     m_dirty_pc = false;
    922   }
    923 
    924   if (flags & FLUSH_INSTRUCTION_BITS)
    925   {
    926     // This sucks, but it's only used for fallbacks.
    927     Panic("Not implemented");
    928   }
    929 
    930   if (flags & FLUSH_LOAD_DELAY_FROM_STATE && m_load_delay_dirty)
    931   {
    932     // This sucks :(
    933     // TODO: make it a function?
    934     rvAsm->LBU(RARG1, PTR(&g_state.load_delay_reg));
    935     rvAsm->LW(RARG2, PTR(&g_state.load_delay_value));
    936     rvAsm->SLLI(RARG1, RARG1, 2); // *4
    937     rvAsm->ADD(RARG1, RARG1, RSTATE);
    938     rvAsm->SW(RARG2, OFFSETOF(CPU::State, regs.r[0]), RARG1);
    939     rvAsm->LI(RSCRATCH, static_cast<u8>(Reg::count));
    940     rvAsm->SB(RSCRATCH, PTR(&g_state.load_delay_reg));
    941     m_load_delay_dirty = false;
    942   }
    943 
    944   if (flags & FLUSH_LOAD_DELAY && m_load_delay_register != Reg::count)
    945   {
    946     if (m_load_delay_value_register != NUM_HOST_REGS)
    947       FreeHostReg(m_load_delay_value_register);
    948 
    949     EmitMov(RSCRATCH, static_cast<u8>(m_load_delay_register));
    950     rvAsm->SB(RSCRATCH, PTR(&g_state.load_delay_reg));
    951     m_load_delay_register = Reg::count;
    952     m_load_delay_dirty = true;
    953   }
    954 
    955   if (flags & FLUSH_GTE_STALL_FROM_STATE && m_dirty_gte_done_cycle)
    956   {
    957     // May as well flush cycles while we're here.
    958     // GTE spanning blocks is very rare, we _could_ disable this for speed.
    959     rvAsm->LW(RARG1, PTR(&g_state.pending_ticks));
    960     rvAsm->LW(RARG2, PTR(&g_state.gte_completion_tick));
    961     if (m_cycles > 0)
    962     {
    963       SafeADDIW(RARG1, RARG1, m_cycles);
    964       m_cycles = 0;
    965     }
    966     Label no_stall;
    967     rvAsm->BGE(RARG1, RARG2, &no_stall);
    968     rvAsm->MV(RARG1, RARG2);
    969     rvAsm->Bind(&no_stall);
    970     rvAsm->SW(RARG1, PTR(&g_state.pending_ticks));
    971     m_dirty_gte_done_cycle = false;
    972   }
    973 
    974   if (flags & FLUSH_GTE_DONE_CYCLE && m_gte_done_cycle > m_cycles)
    975   {
    976     rvAsm->LW(RARG1, PTR(&g_state.pending_ticks));
    977 
    978     // update cycles at the same time
    979     if (flags & FLUSH_CYCLES && m_cycles > 0)
    980     {
    981       SafeADDIW(RARG1, RARG1, m_cycles);
    982       rvAsm->SW(RARG1, PTR(&g_state.pending_ticks));
    983       m_gte_done_cycle -= m_cycles;
    984       m_cycles = 0;
    985     }
    986 
    987     SafeADDIW(RARG1, RARG1, m_gte_done_cycle);
    988     rvAsm->SW(RARG1, PTR(&g_state.gte_completion_tick));
    989     m_gte_done_cycle = 0;
    990     m_dirty_gte_done_cycle = true;
    991   }
    992 
    993   if (flags & FLUSH_CYCLES && m_cycles > 0)
    994   {
    995     rvAsm->LW(RARG1, PTR(&g_state.pending_ticks));
    996     SafeADDIW(RARG1, RARG1, m_cycles);
    997     rvAsm->SW(RARG1, PTR(&g_state.pending_ticks));
    998     m_gte_done_cycle = std::max<TickCount>(m_gte_done_cycle - m_cycles, 0);
    999     m_cycles = 0;
   1000   }
   1001 }
   1002 
   1003 void CPU::NewRec::RISCV64Compiler::Compile_Fallback()
   1004 {
   1005   WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", iinfo->pc, inst->bits);
   1006 
   1007   Flush(FLUSH_FOR_INTERPRETER);
   1008 
   1009 #if 0
   1010   cg->call(&CPU::Recompiler::Thunks::InterpretInstruction);
   1011 
   1012   // TODO: make me less garbage
   1013   // TODO: this is wrong, it flushes the load delay on the same cycle when we return.
   1014   // but nothing should be going through here..
   1015   Label no_load_delay;
   1016   cg->movzx(RWARG1, cg->byte[PTR(&g_state.next_load_delay_reg)]);
   1017   cg->cmp(RWARG1, static_cast<u8>(Reg::count));
   1018   cg->je(no_load_delay, CodeGenerator::T_SHORT);
   1019   cg->mov(RWARG2, cg->dword[PTR(&g_state.next_load_delay_value)]);
   1020   cg->mov(cg->byte[PTR(&g_state.load_delay_reg)], RWARG1);
   1021   cg->mov(cg->dword[PTR(&g_state.load_delay_value)], RWARG2);
   1022   cg->mov(cg->byte[PTR(&g_state.next_load_delay_reg)], static_cast<u32>(Reg::count));
   1023   cg->L(no_load_delay);
   1024 
   1025   m_load_delay_dirty = EMULATE_LOAD_DELAYS;
   1026 #else
   1027   Panic("Fixme");
   1028 #endif
   1029 }
   1030 
   1031 void CPU::NewRec::RISCV64Compiler::CheckBranchTarget(const biscuit::GPR& pcreg)
   1032 {
   1033   if (!g_settings.cpu_recompiler_memory_exceptions)
   1034     return;
   1035 
   1036   DebugAssert(pcreg != RSCRATCH);
   1037   rvAsm->ANDI(RSCRATCH, pcreg, 0x3);
   1038   SwitchToFarCode(true, &Assembler::BEQ, RSCRATCH, zero);
   1039 
   1040   BackupHostState();
   1041   EndBlockWithException(Exception::AdEL);
   1042 
   1043   RestoreHostState();
   1044   SwitchToNearCode(false);
   1045 }
   1046 
   1047 void CPU::NewRec::RISCV64Compiler::Compile_jr(CompileFlags cf)
   1048 {
   1049   const GPR pcreg = CFGetRegS(cf);
   1050   CheckBranchTarget(pcreg);
   1051 
   1052   rvAsm->SW(pcreg, PTR(&g_state.pc));
   1053 
   1054   CompileBranchDelaySlot(false);
   1055   EndBlock(std::nullopt, true);
   1056 }
   1057 
   1058 void CPU::NewRec::RISCV64Compiler::Compile_jalr(CompileFlags cf)
   1059 {
   1060   const GPR pcreg = CFGetRegS(cf);
   1061   if (MipsD() != Reg::zero)
   1062     SetConstantReg(MipsD(), GetBranchReturnAddress(cf));
   1063 
   1064   CheckBranchTarget(pcreg);
   1065   rvAsm->SW(pcreg, PTR(&g_state.pc));
   1066 
   1067   CompileBranchDelaySlot(false);
   1068   EndBlock(std::nullopt, true);
   1069 }
   1070 
   1071 void CPU::NewRec::RISCV64Compiler::Compile_bxx(CompileFlags cf, BranchCondition cond)
   1072 {
   1073   AssertRegOrConstS(cf);
   1074 
   1075   const u32 taken_pc = GetConditionalBranchTarget(cf);
   1076 
   1077   Flush(FLUSH_FOR_BRANCH);
   1078 
   1079   DebugAssert(cf.valid_host_s);
   1080 
   1081   // MipsT() here should equal zero for zero branches.
   1082   DebugAssert(cond == BranchCondition::Equal || cond == BranchCondition::NotEqual || cf.MipsT() == Reg::zero);
   1083 
   1084   Label taken;
   1085   const GPR rs = CFGetRegS(cf);
   1086   switch (cond)
   1087   {
   1088     case BranchCondition::Equal:
   1089     case BranchCondition::NotEqual:
   1090     {
   1091       AssertRegOrConstT(cf);
   1092       if (cf.const_t && HasConstantRegValue(cf.MipsT(), 0))
   1093       {
   1094         (cond == BranchCondition::Equal) ? rvAsm->BEQZ(rs, &taken) : rvAsm->BNEZ(rs, &taken);
   1095       }
   1096       else
   1097       {
   1098         const GPR rt = cf.valid_host_t ? CFGetRegT(cf) : RARG1;
   1099         if (!cf.valid_host_t)
   1100           MoveTToReg(RARG1, cf);
   1101         if (cond == Compiler::BranchCondition::Equal)
   1102           rvAsm->BEQ(rs, rt, &taken);
   1103         else
   1104           rvAsm->BNE(rs, rt, &taken);
   1105       }
   1106     }
   1107     break;
   1108 
   1109     case BranchCondition::GreaterThanZero:
   1110     {
   1111       rvAsm->BGTZ(rs, &taken);
   1112     }
   1113     break;
   1114 
   1115     case BranchCondition::GreaterEqualZero:
   1116     {
   1117       rvAsm->BGEZ(rs, &taken);
   1118     }
   1119     break;
   1120 
   1121     case BranchCondition::LessThanZero:
   1122     {
   1123       rvAsm->BLTZ(rs, &taken);
   1124     }
   1125     break;
   1126 
   1127     case BranchCondition::LessEqualZero:
   1128     {
   1129       rvAsm->BLEZ(rs, &taken);
   1130     }
   1131     break;
   1132   }
   1133 
   1134   BackupHostState();
   1135   if (!cf.delay_slot_swapped)
   1136     CompileBranchDelaySlot();
   1137 
   1138   EndBlock(m_compiler_pc, true);
   1139 
   1140   rvAsm->Bind(&taken);
   1141 
   1142   RestoreHostState();
   1143   if (!cf.delay_slot_swapped)
   1144     CompileBranchDelaySlot();
   1145 
   1146   EndBlock(taken_pc, true);
   1147 }
   1148 
   1149 void CPU::NewRec::RISCV64Compiler::Compile_addi(CompileFlags cf, bool overflow)
   1150 {
   1151   const GPR rs = CFGetRegS(cf);
   1152   const GPR rt = CFGetRegT(cf);
   1153   if (const u32 imm = inst->i.imm_sext32(); imm != 0)
   1154   {
   1155     if (!overflow)
   1156     {
   1157       SafeADDIW(rt, rs, imm);
   1158     }
   1159     else
   1160     {
   1161       SafeADDI(RARG1, rs, imm);
   1162       SafeADDIW(rt, rs, imm);
   1163       TestOverflow(RARG1, rt, rt);
   1164     }
   1165   }
   1166   else if (rt.Index() != rs.Index())
   1167   {
   1168     rvAsm->MV(rt, rs);
   1169   }
   1170 }
   1171 
   1172 void CPU::NewRec::RISCV64Compiler::Compile_addi(CompileFlags cf)
   1173 {
   1174   Compile_addi(cf, g_settings.cpu_recompiler_memory_exceptions);
   1175 }
   1176 
   1177 void CPU::NewRec::RISCV64Compiler::Compile_addiu(CompileFlags cf)
   1178 {
   1179   Compile_addi(cf, false);
   1180 }
   1181 
   1182 void CPU::NewRec::RISCV64Compiler::Compile_slti(CompileFlags cf)
   1183 {
   1184   Compile_slti(cf, true);
   1185 }
   1186 
   1187 void CPU::NewRec::RISCV64Compiler::Compile_sltiu(CompileFlags cf)
   1188 {
   1189   Compile_slti(cf, false);
   1190 }
   1191 
   1192 void CPU::NewRec::RISCV64Compiler::Compile_slti(CompileFlags cf, bool sign)
   1193 {
   1194   if (sign)
   1195     SafeSLTI(CFGetRegT(cf), CFGetRegS(cf), inst->i.imm_sext32());
   1196   else
   1197     SafeSLTIU(CFGetRegT(cf), CFGetRegS(cf), inst->i.imm_sext32());
   1198 }
   1199 
   1200 void CPU::NewRec::RISCV64Compiler::Compile_andi(CompileFlags cf)
   1201 {
   1202   const GPR rt = CFGetRegT(cf);
   1203   if (const u32 imm = inst->i.imm_zext32(); imm != 0)
   1204     SafeANDI(rt, CFGetRegS(cf), imm);
   1205   else
   1206     EmitMov(rt, 0);
   1207 }
   1208 
   1209 void CPU::NewRec::RISCV64Compiler::Compile_ori(CompileFlags cf)
   1210 {
   1211   const GPR rt = CFGetRegT(cf);
   1212   const GPR rs = CFGetRegS(cf);
   1213   if (const u32 imm = inst->i.imm_zext32(); imm != 0)
   1214     SafeORI(rt, rs, imm);
   1215   else if (rt.Index() != rs.Index())
   1216     rvAsm->MV(rt, rs);
   1217 }
   1218 
   1219 void CPU::NewRec::RISCV64Compiler::Compile_xori(CompileFlags cf)
   1220 {
   1221   const GPR rt = CFGetRegT(cf);
   1222   const GPR rs = CFGetRegS(cf);
   1223   if (const u32 imm = inst->i.imm_zext32(); imm != 0)
   1224     SafeXORI(rt, rs, imm);
   1225   else if (rt.Index() != rs.Index())
   1226     rvAsm->MV(rt, rs);
   1227 }
   1228 
   1229 void CPU::NewRec::RISCV64Compiler::Compile_shift(
   1230   CompileFlags cf, void (biscuit::Assembler::*op)(biscuit::GPR, biscuit::GPR, biscuit::GPR),
   1231   void (biscuit::Assembler::*op_const)(biscuit::GPR, biscuit::GPR, unsigned))
   1232 {
   1233   const GPR rd = CFGetRegD(cf);
   1234   const GPR rt = CFGetRegT(cf);
   1235   if (inst->r.shamt > 0)
   1236     (rvAsm->*op_const)(rd, rt, inst->r.shamt);
   1237   else if (rd.Index() != rt.Index())
   1238     rvAsm->MV(rd, rt);
   1239 }
   1240 
   1241 void CPU::NewRec::RISCV64Compiler::Compile_sll(CompileFlags cf)
   1242 {
   1243   Compile_shift(cf, &Assembler::SLLW, &Assembler::SLLIW);
   1244 }
   1245 
   1246 void CPU::NewRec::RISCV64Compiler::Compile_srl(CompileFlags cf)
   1247 {
   1248   Compile_shift(cf, &Assembler::SRLW, &Assembler::SRLIW);
   1249 }
   1250 
   1251 void CPU::NewRec::RISCV64Compiler::Compile_sra(CompileFlags cf)
   1252 {
   1253   Compile_shift(cf, &Assembler::SRAW, &Assembler::SRAIW);
   1254 }
   1255 
   1256 void CPU::NewRec::RISCV64Compiler::Compile_variable_shift(
   1257   CompileFlags cf, void (biscuit::Assembler::*op)(biscuit::GPR, biscuit::GPR, biscuit::GPR),
   1258   void (biscuit::Assembler::*op_const)(biscuit::GPR, biscuit::GPR, unsigned))
   1259 {
   1260   const GPR rd = CFGetRegD(cf);
   1261 
   1262   AssertRegOrConstS(cf);
   1263   AssertRegOrConstT(cf);
   1264 
   1265   const GPR rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2;
   1266   if (!cf.valid_host_t)
   1267     MoveTToReg(rt, cf);
   1268 
   1269   if (cf.const_s)
   1270   {
   1271     if (const u32 shift = GetConstantRegU32(cf.MipsS()); shift != 0)
   1272       (rvAsm->*op_const)(rd, rt, shift & 31u);
   1273     else if (rd.Index() != rt.Index())
   1274       rvAsm->MV(rd, rt);
   1275   }
   1276   else
   1277   {
   1278     (rvAsm->*op)(rd, rt, CFGetRegS(cf));
   1279   }
   1280 }
   1281 
   1282 void CPU::NewRec::RISCV64Compiler::Compile_sllv(CompileFlags cf)
   1283 {
   1284   Compile_variable_shift(cf, &Assembler::SLLW, &Assembler::SLLIW);
   1285 }
   1286 
   1287 void CPU::NewRec::RISCV64Compiler::Compile_srlv(CompileFlags cf)
   1288 {
   1289   Compile_variable_shift(cf, &Assembler::SRLW, &Assembler::SRLIW);
   1290 }
   1291 
   1292 void CPU::NewRec::RISCV64Compiler::Compile_srav(CompileFlags cf)
   1293 {
   1294   Compile_variable_shift(cf, &Assembler::SRAW, &Assembler::SRAIW);
   1295 }
   1296 
   1297 void CPU::NewRec::RISCV64Compiler::Compile_mult(CompileFlags cf, bool sign)
   1298 {
   1299   const GPR rs = cf.valid_host_s ? CFGetRegS(cf) : RARG1;
   1300   if (!cf.valid_host_s)
   1301     MoveSToReg(rs, cf);
   1302 
   1303   const GPR rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2;
   1304   if (!cf.valid_host_t)
   1305     MoveTToReg(rt, cf);
   1306 
   1307   // TODO: if lo/hi gets killed, we can use a 32-bit multiply
   1308   const GPR lo = CFGetRegLO(cf);
   1309   const GPR hi = CFGetRegHI(cf);
   1310 
   1311   if (sign)
   1312   {
   1313     rvAsm->MUL(lo, rs, rt);
   1314     rvAsm->SRAI64(hi, lo, 32);
   1315     EmitDSExtW(lo, lo);
   1316   }
   1317   else
   1318   {
   1319     // Need to make it unsigned.
   1320     EmitDUExtW(RARG1, rs);
   1321     EmitDUExtW(RARG2, rt);
   1322     rvAsm->MUL(lo, RARG1, RARG2);
   1323     rvAsm->SRAI64(hi, lo, 32);
   1324     EmitDSExtW(lo, lo);
   1325   }
   1326 }
   1327 
   1328 void CPU::NewRec::RISCV64Compiler::Compile_mult(CompileFlags cf)
   1329 {
   1330   Compile_mult(cf, true);
   1331 }
   1332 
   1333 void CPU::NewRec::RISCV64Compiler::Compile_multu(CompileFlags cf)
   1334 {
   1335   Compile_mult(cf, false);
   1336 }
   1337 
   1338 void CPU::NewRec::RISCV64Compiler::Compile_div(CompileFlags cf)
   1339 {
   1340   // 36 Volume I: RISC-V User-Level ISA V2.2
   1341   const GPR rs = cf.valid_host_s ? CFGetRegS(cf) : RARG1;
   1342   if (!cf.valid_host_s)
   1343     MoveSToReg(rs, cf);
   1344 
   1345   const GPR rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2;
   1346   if (!cf.valid_host_t)
   1347     MoveTToReg(rt, cf);
   1348 
   1349   const GPR rlo = CFGetRegLO(cf);
   1350   const GPR rhi = CFGetRegHI(cf);
   1351 
   1352   Label done;
   1353   Label not_divide_by_zero;
   1354   rvAsm->BNEZ(rt, &not_divide_by_zero);
   1355   rvAsm->MV(rhi, rs); // hi = num
   1356   rvAsm->SRAI64(rlo, rs, 63);
   1357   rvAsm->ANDI(rlo, rlo, 2);
   1358   rvAsm->ADDI(rlo, rlo, -1); // lo = s >= 0 ? -1 : 1
   1359   rvAsm->J(&done);
   1360 
   1361   rvAsm->Bind(&not_divide_by_zero);
   1362   Label not_unrepresentable;
   1363   EmitMov(RSCRATCH, static_cast<u32>(-1));
   1364   rvAsm->BNE(rt, RSCRATCH, &not_unrepresentable);
   1365   EmitMov(rlo, 0x80000000u);
   1366   rvAsm->BNE(rs, rlo, &not_unrepresentable);
   1367   EmitMov(rhi, 0);
   1368   rvAsm->J(&done);
   1369 
   1370   rvAsm->Bind(&not_unrepresentable);
   1371 
   1372   rvAsm->DIVW(rlo, rs, rt);
   1373   rvAsm->REMW(rhi, rs, rt);
   1374 
   1375   rvAsm->Bind(&done);
   1376 }
   1377 
   1378 void CPU::NewRec::RISCV64Compiler::Compile_divu(CompileFlags cf)
   1379 {
   1380   const GPR rs = cf.valid_host_s ? CFGetRegS(cf) : RARG1;
   1381   if (!cf.valid_host_s)
   1382     MoveSToReg(rs, cf);
   1383 
   1384   const GPR rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2;
   1385   if (!cf.valid_host_t)
   1386     MoveTToReg(rt, cf);
   1387 
   1388   const GPR rlo = CFGetRegLO(cf);
   1389   const GPR rhi = CFGetRegHI(cf);
   1390 
   1391   // Semantics match? :-)
   1392   rvAsm->DIVUW(rlo, rs, rt);
   1393   rvAsm->REMUW(rhi, rs, rt);
   1394 }
   1395 
   1396 void CPU::NewRec::RISCV64Compiler::TestOverflow(const biscuit::GPR& long_res, const biscuit::GPR& res,
   1397                                                 const biscuit::GPR& reg_to_discard)
   1398 {
   1399   SwitchToFarCode(true, &Assembler::BEQ, long_res, res);
   1400 
   1401   BackupHostState();
   1402 
   1403   // toss the result
   1404   ClearHostReg(reg_to_discard.Index());
   1405 
   1406   EndBlockWithException(Exception::Ov);
   1407 
   1408   RestoreHostState();
   1409 
   1410   SwitchToNearCode(false);
   1411 }
   1412 
   1413 void CPU::NewRec::RISCV64Compiler::Compile_dst_op(
   1414   CompileFlags cf, void (biscuit::Assembler::*op)(biscuit::GPR, biscuit::GPR, biscuit::GPR),
   1415   void (RISCV64Compiler::*op_const)(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm),
   1416   void (biscuit::Assembler::*op_long)(biscuit::GPR, biscuit::GPR, biscuit::GPR), bool commutative, bool overflow)
   1417 {
   1418   AssertRegOrConstS(cf);
   1419   AssertRegOrConstT(cf);
   1420 
   1421   const GPR rd = CFGetRegD(cf);
   1422 
   1423   if (overflow)
   1424   {
   1425     const GPR rs = CFGetSafeRegS(cf, RARG1);
   1426     const GPR rt = CFGetSafeRegT(cf, RARG2);
   1427     (rvAsm->*op)(RARG3, rs, rt);
   1428     (rvAsm->*op_long)(rd, rs, rt);
   1429     TestOverflow(RARG3, rd, rd);
   1430     return;
   1431   }
   1432 
   1433   if (cf.valid_host_s && cf.valid_host_t)
   1434   {
   1435     (rvAsm->*op)(rd, CFGetRegS(cf), CFGetRegT(cf));
   1436   }
   1437   else if (commutative && (cf.const_s || cf.const_t))
   1438   {
   1439     const GPR src = cf.const_s ? CFGetRegT(cf) : CFGetRegS(cf);
   1440     if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0)
   1441     {
   1442       (this->*op_const)(rd, src, cv);
   1443     }
   1444     else
   1445     {
   1446       if (rd.Index() != src.Index())
   1447         rvAsm->MV(rd, src);
   1448       overflow = false;
   1449     }
   1450   }
   1451   else if (cf.const_s)
   1452   {
   1453     if (HasConstantRegValue(cf.MipsS(), 0))
   1454     {
   1455       (rvAsm->*op)(rd, zero, CFGetRegT(cf));
   1456     }
   1457     else
   1458     {
   1459       EmitMov(RSCRATCH, GetConstantRegU32(cf.MipsS()));
   1460       (rvAsm->*op)(rd, RSCRATCH, CFGetRegT(cf));
   1461     }
   1462   }
   1463   else if (cf.const_t)
   1464   {
   1465     const GPR rs = CFGetRegS(cf);
   1466     if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0)
   1467     {
   1468       (this->*op_const)(rd, rs, cv);
   1469     }
   1470     else
   1471     {
   1472       if (rd.Index() != rs.Index())
   1473         rvAsm->MV(rd, rs);
   1474       overflow = false;
   1475     }
   1476   }
   1477 }
   1478 
   1479 void CPU::NewRec::RISCV64Compiler::Compile_add(CompileFlags cf)
   1480 {
   1481   Compile_dst_op(cf, &Assembler::ADDW, &RISCV64Compiler::SafeADDIW, &Assembler::ADD, true,
   1482                  g_settings.cpu_recompiler_memory_exceptions);
   1483 }
   1484 
   1485 void CPU::NewRec::RISCV64Compiler::Compile_addu(CompileFlags cf)
   1486 {
   1487   Compile_dst_op(cf, &Assembler::ADDW, &RISCV64Compiler::SafeADDIW, &Assembler::ADD, true, false);
   1488 }
   1489 
   1490 void CPU::NewRec::RISCV64Compiler::Compile_sub(CompileFlags cf)
   1491 {
   1492   Compile_dst_op(cf, &Assembler::SUBW, &RISCV64Compiler::SafeSUBIW, &Assembler::SUB, false,
   1493                  g_settings.cpu_recompiler_memory_exceptions);
   1494 }
   1495 
   1496 void CPU::NewRec::RISCV64Compiler::Compile_subu(CompileFlags cf)
   1497 {
   1498   Compile_dst_op(cf, &Assembler::SUBW, &RISCV64Compiler::SafeSUBIW, &Assembler::SUB, false, false);
   1499 }
   1500 
   1501 void CPU::NewRec::RISCV64Compiler::Compile_and(CompileFlags cf)
   1502 {
   1503   AssertRegOrConstS(cf);
   1504   AssertRegOrConstT(cf);
   1505 
   1506   // special cases - and with self -> self, and with 0 -> 0
   1507   const GPR regd = CFGetRegD(cf);
   1508   if (cf.MipsS() == cf.MipsT())
   1509   {
   1510     rvAsm->MV(regd, CFGetRegS(cf));
   1511     return;
   1512   }
   1513   else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))
   1514   {
   1515     EmitMov(regd, 0);
   1516     return;
   1517   }
   1518 
   1519   Compile_dst_op(cf, &Assembler::AND, &RISCV64Compiler::SafeANDI, &Assembler::AND, true, false);
   1520 }
   1521 
   1522 void CPU::NewRec::RISCV64Compiler::Compile_or(CompileFlags cf)
   1523 {
   1524   AssertRegOrConstS(cf);
   1525   AssertRegOrConstT(cf);
   1526 
   1527   // or/nor with 0 -> no effect
   1528   const GPR regd = CFGetRegD(cf);
   1529   if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0) || cf.MipsS() == cf.MipsT())
   1530   {
   1531     cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);
   1532     return;
   1533   }
   1534 
   1535   Compile_dst_op(cf, &Assembler::OR, &RISCV64Compiler::SafeORI, &Assembler::OR, true, false);
   1536 }
   1537 
   1538 void CPU::NewRec::RISCV64Compiler::Compile_xor(CompileFlags cf)
   1539 {
   1540   AssertRegOrConstS(cf);
   1541   AssertRegOrConstT(cf);
   1542 
   1543   const GPR regd = CFGetRegD(cf);
   1544   if (cf.MipsS() == cf.MipsT())
   1545   {
   1546     // xor with self -> zero
   1547     EmitMov(regd, 0);
   1548     return;
   1549   }
   1550   else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))
   1551   {
   1552     // xor with zero -> no effect
   1553     cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);
   1554     return;
   1555   }
   1556 
   1557   Compile_dst_op(cf, &Assembler::XOR, &RISCV64Compiler::SafeXORI, &Assembler::XOR, true, false);
   1558 }
   1559 
   1560 void CPU::NewRec::RISCV64Compiler::Compile_nor(CompileFlags cf)
   1561 {
   1562   Compile_or(cf);
   1563   rvAsm->NOT(CFGetRegD(cf), CFGetRegD(cf));
   1564 }
   1565 
   1566 void CPU::NewRec::RISCV64Compiler::Compile_slt(CompileFlags cf)
   1567 {
   1568   Compile_slt(cf, true);
   1569 }
   1570 
   1571 void CPU::NewRec::RISCV64Compiler::Compile_sltu(CompileFlags cf)
   1572 {
   1573   Compile_slt(cf, false);
   1574 }
   1575 
   1576 void CPU::NewRec::RISCV64Compiler::Compile_slt(CompileFlags cf, bool sign)
   1577 {
   1578   AssertRegOrConstS(cf);
   1579   AssertRegOrConstT(cf);
   1580 
   1581   const GPR rd = CFGetRegD(cf);
   1582   const GPR rs = CFGetSafeRegS(cf, RARG1);
   1583 
   1584   if (cf.const_t && rvIsValidSExtITypeImm(GetConstantRegU32(cf.MipsT())))
   1585   {
   1586     if (sign)
   1587       rvAsm->SLTI(rd, rs, GetConstantRegS32(cf.MipsT()));
   1588     else
   1589       rvAsm->SLTIU(rd, rs, GetConstantRegS32(cf.MipsT()));
   1590   }
   1591   else
   1592   {
   1593     const GPR rt = CFGetSafeRegT(cf, RARG2);
   1594     if (sign)
   1595       rvAsm->SLT(rd, rs, rt);
   1596     else
   1597       rvAsm->SLTU(rd, rs, rt);
   1598   }
   1599 }
   1600 
   1601 biscuit::GPR CPU::NewRec::RISCV64Compiler::ComputeLoadStoreAddressArg(
   1602   CompileFlags cf, const std::optional<VirtualMemoryAddress>& address, const std::optional<const biscuit::GPR>& reg)
   1603 {
   1604   const u32 imm = inst->i.imm_sext32();
   1605   if (cf.valid_host_s && imm == 0 && !reg.has_value())
   1606     return CFGetRegS(cf);
   1607 
   1608   const GPR dst = reg.has_value() ? reg.value() : RARG1;
   1609   if (address.has_value())
   1610   {
   1611     EmitMov(dst, address.value());
   1612   }
   1613   else if (imm == 0)
   1614   {
   1615     if (cf.valid_host_s)
   1616     {
   1617       if (const GPR src = CFGetRegS(cf); src.Index() != dst.Index())
   1618         rvAsm->MV(dst, CFGetRegS(cf));
   1619     }
   1620     else
   1621     {
   1622       rvAsm->LW(dst, PTR(&g_state.regs.r[cf.mips_s]));
   1623     }
   1624   }
   1625   else
   1626   {
   1627     if (cf.valid_host_s)
   1628     {
   1629       SafeADDIW(dst, CFGetRegS(cf), inst->i.imm_sext32());
   1630     }
   1631     else
   1632     {
   1633       rvAsm->LW(dst, PTR(&g_state.regs.r[cf.mips_s]));
   1634       SafeADDIW(dst, dst, inst->i.imm_sext32());
   1635     }
   1636   }
   1637 
   1638   return dst;
   1639 }
   1640 
   1641 template<typename RegAllocFn>
   1642 biscuit::GPR CPU::NewRec::RISCV64Compiler::GenerateLoad(const biscuit::GPR& addr_reg, MemoryAccessSize size, bool sign,
   1643                                                         bool use_fastmem, const RegAllocFn& dst_reg_alloc)
   1644 {
   1645   if (use_fastmem)
   1646   {
   1647     m_cycles += Bus::RAM_READ_TICKS;
   1648 
   1649     // TODO: Make this better. If we're loading the address from state, we can use LWU instead, and skip this.
   1650     // TODO: LUT fastmem
   1651     const GPR dst = dst_reg_alloc();
   1652     rvAsm->SLLI64(RSCRATCH, addr_reg, 32);
   1653     rvAsm->SRLI64(RSCRATCH, RSCRATCH, 32);
   1654 
   1655     if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT)
   1656     {
   1657       DebugAssert(addr_reg.Index() != RARG3.Index());
   1658       rvAsm->SRLI64(RARG3, RSCRATCH, Bus::FASTMEM_LUT_PAGE_SHIFT);
   1659       rvAsm->SLLI64(RARG3, RARG3, 8);
   1660       rvAsm->ADD(RARG3, RARG3, RMEMBASE);
   1661       rvAsm->LD(RARG3, 0, RARG3);
   1662       rvAsm->ADD(RSCRATCH, RSCRATCH, RARG3);
   1663     }
   1664     else
   1665     {
   1666       rvAsm->ADD(RSCRATCH, RSCRATCH, RMEMBASE);
   1667     }
   1668 
   1669     u8* start = m_emitter->GetCursorPointer();
   1670     switch (size)
   1671     {
   1672       case MemoryAccessSize::Byte:
   1673         sign ? rvAsm->LB(dst, 0, RSCRATCH) : rvAsm->LBU(dst, 0, RSCRATCH);
   1674         break;
   1675 
   1676       case MemoryAccessSize::HalfWord:
   1677         sign ? rvAsm->LH(dst, 0, RSCRATCH) : rvAsm->LHU(dst, 0, RSCRATCH);
   1678         break;
   1679 
   1680       case MemoryAccessSize::Word:
   1681         rvAsm->LW(dst, 0, RSCRATCH);
   1682         break;
   1683     }
   1684 
   1685     // We need a nop, because the slowmem jump might be more than 1MB away.
   1686     rvAsm->NOP();
   1687 
   1688     AddLoadStoreInfo(start, 8, addr_reg.Index(), dst.Index(), size, sign, true);
   1689     return dst;
   1690   }
   1691 
   1692   if (addr_reg.Index() != RARG1.Index())
   1693     rvAsm->MV(RARG1, addr_reg);
   1694 
   1695   const bool checked = g_settings.cpu_recompiler_memory_exceptions;
   1696   switch (size)
   1697   {
   1698     case MemoryAccessSize::Byte:
   1699     {
   1700       EmitCall(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::ReadMemoryByte) :
   1701                          reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryByte));
   1702     }
   1703     break;
   1704     case MemoryAccessSize::HalfWord:
   1705     {
   1706       EmitCall(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::ReadMemoryHalfWord) :
   1707                          reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryHalfWord));
   1708     }
   1709     break;
   1710     case MemoryAccessSize::Word:
   1711     {
   1712       EmitCall(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::ReadMemoryWord) :
   1713                          reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryWord));
   1714     }
   1715     break;
   1716   }
   1717 
   1718   // TODO: turn this into an asm function instead
   1719   if (checked)
   1720   {
   1721     rvAsm->SRLI64(RSCRATCH, RRET, 63);
   1722     SwitchToFarCode(true, &Assembler::BEQ, RSCRATCH, zero);
   1723     BackupHostState();
   1724 
   1725     // Need to stash this in a temp because of the flush.
   1726     const GPR temp = GPR(AllocateTempHostReg(HR_CALLEE_SAVED));
   1727     rvAsm->NEG(temp, RRET);
   1728     rvAsm->SLLIW(temp, temp, 2);
   1729 
   1730     Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);
   1731 
   1732     // cause_bits = (-result << 2) | BD | cop_n
   1733     SafeORI(RARG1, temp,
   1734             Cop0Registers::CAUSE::MakeValueForException(
   1735               static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n));
   1736     EmitMov(RARG2, m_current_instruction_pc);
   1737     EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
   1738     FreeHostReg(temp.Index());
   1739     EndBlock(std::nullopt, true);
   1740 
   1741     RestoreHostState();
   1742     SwitchToNearCode(false);
   1743   }
   1744 
   1745   const GPR dst_reg = dst_reg_alloc();
   1746   switch (size)
   1747   {
   1748     case MemoryAccessSize::Byte:
   1749     {
   1750       sign ? EmitSExtB(dst_reg, RRET) : EmitUExtB(dst_reg, RRET);
   1751     }
   1752     break;
   1753     case MemoryAccessSize::HalfWord:
   1754     {
   1755       sign ? EmitSExtH(dst_reg, RRET) : EmitUExtH(dst_reg, RRET);
   1756     }
   1757     break;
   1758     case MemoryAccessSize::Word:
   1759     {
   1760       // Need to undo the zero-extend.
   1761       if (checked)
   1762         rvEmitDSExtW(rvAsm, dst_reg, RRET);
   1763       else if (dst_reg.Index() != RRET.Index())
   1764         rvAsm->MV(dst_reg, RRET);
   1765     }
   1766     break;
   1767   }
   1768 
   1769   return dst_reg;
   1770 }
   1771 
   1772 void CPU::NewRec::RISCV64Compiler::GenerateStore(const biscuit::GPR& addr_reg, const biscuit::GPR& value_reg,
   1773                                                  MemoryAccessSize size, bool use_fastmem)
   1774 {
   1775   if (use_fastmem)
   1776   {
   1777     DebugAssert(value_reg != RSCRATCH);
   1778     rvAsm->SLLI64(RSCRATCH, addr_reg, 32);
   1779     rvAsm->SRLI64(RSCRATCH, RSCRATCH, 32);
   1780 
   1781     if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT)
   1782     {
   1783       DebugAssert(addr_reg.Index() != RARG3.Index());
   1784       rvAsm->SRLI64(RARG3, RSCRATCH, Bus::FASTMEM_LUT_PAGE_SHIFT);
   1785       rvAsm->SLLI64(RARG3, RARG3, 8);
   1786       rvAsm->ADD(RARG3, RARG3, RMEMBASE);
   1787       rvAsm->LD(RARG3, 0, RARG3);
   1788       rvAsm->ADD(RSCRATCH, RSCRATCH, RARG3);
   1789     }
   1790     else
   1791     {
   1792       rvAsm->ADD(RSCRATCH, RSCRATCH, RMEMBASE);
   1793     }
   1794 
   1795     u8* start = m_emitter->GetCursorPointer();
   1796     switch (size)
   1797     {
   1798       case MemoryAccessSize::Byte:
   1799         rvAsm->SB(value_reg, 0, RSCRATCH);
   1800         break;
   1801 
   1802       case MemoryAccessSize::HalfWord:
   1803         rvAsm->SH(value_reg, 0, RSCRATCH);
   1804         break;
   1805 
   1806       case MemoryAccessSize::Word:
   1807         rvAsm->SW(value_reg, 0, RSCRATCH);
   1808         break;
   1809     }
   1810 
   1811     // We need a nop, because the slowmem jump might be more than 1MB away.
   1812     rvAsm->NOP();
   1813 
   1814     AddLoadStoreInfo(start, 8, addr_reg.Index(), value_reg.Index(), size, false, false);
   1815     return;
   1816   }
   1817 
   1818   if (addr_reg.Index() != RARG1.Index())
   1819     rvAsm->MV(RARG1, addr_reg);
   1820   if (value_reg.Index() != RARG2.Index())
   1821     rvAsm->MV(RARG2, value_reg);
   1822 
   1823   const bool checked = g_settings.cpu_recompiler_memory_exceptions;
   1824   switch (size)
   1825   {
   1826     case MemoryAccessSize::Byte:
   1827     {
   1828       EmitCall(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::WriteMemoryByte) :
   1829                          reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryByte));
   1830     }
   1831     break;
   1832     case MemoryAccessSize::HalfWord:
   1833     {
   1834       EmitCall(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::WriteMemoryHalfWord) :
   1835                          reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryHalfWord));
   1836     }
   1837     break;
   1838     case MemoryAccessSize::Word:
   1839     {
   1840       EmitCall(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::WriteMemoryWord) :
   1841                          reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryWord));
   1842     }
   1843     break;
   1844   }
   1845 
   1846   // TODO: turn this into an asm function instead
   1847   if (checked)
   1848   {
   1849     SwitchToFarCode(true, &Assembler::BEQ, RRET, zero);
   1850     BackupHostState();
   1851 
   1852     // Need to stash this in a temp because of the flush.
   1853     const GPR temp = GPR(AllocateTempHostReg(HR_CALLEE_SAVED));
   1854     rvAsm->SLLIW(temp, RRET, 2);
   1855 
   1856     Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);
   1857 
   1858     // cause_bits = (result << 2) | BD | cop_n
   1859     SafeORI(RARG1, temp,
   1860             Cop0Registers::CAUSE::MakeValueForException(
   1861               static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n));
   1862     EmitMov(RARG2, m_current_instruction_pc);
   1863     EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
   1864     FreeHostReg(temp.Index());
   1865     EndBlock(std::nullopt, true);
   1866 
   1867     RestoreHostState();
   1868     SwitchToNearCode(false);
   1869   }
   1870 }
   1871 
   1872 void CPU::NewRec::RISCV64Compiler::Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
   1873                                                const std::optional<VirtualMemoryAddress>& address)
   1874 {
   1875   const std::optional<GPR> addr_reg = (g_settings.gpu_pgxp_enable && cf.MipsT() != Reg::zero) ?
   1876                                         std::optional<GPR>(GPR(AllocateTempHostReg(HR_CALLEE_SAVED))) :
   1877                                         std::optional<GPR>();
   1878   FlushForLoadStore(address, false, use_fastmem);
   1879   const GPR addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
   1880   const GPR data = GenerateLoad(addr, size, sign, use_fastmem, [this, cf]() {
   1881     if (cf.MipsT() == Reg::zero)
   1882       return RRET;
   1883 
   1884     return GPR(AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
   1885                                EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, cf.MipsT()));
   1886   });
   1887 
   1888   if (g_settings.gpu_pgxp_enable && cf.MipsT() != Reg::zero)
   1889   {
   1890     Flush(FLUSH_FOR_C_CALL);
   1891 
   1892     EmitMov(RARG1, inst->bits);
   1893     rvAsm->MV(RARG2, addr);
   1894     rvAsm->MV(RARG3, data);
   1895     EmitCall(s_pgxp_mem_load_functions[static_cast<u32>(size)][static_cast<u32>(sign)]);
   1896     FreeHostReg(addr_reg.value().Index());
   1897   }
   1898 }
   1899 
   1900 void CPU::NewRec::RISCV64Compiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
   1901                                                const std::optional<VirtualMemoryAddress>& address)
   1902 {
   1903   DebugAssert(size == MemoryAccessSize::Word && !sign);
   1904 
   1905   const GPR addr = GPR(AllocateTempHostReg(HR_CALLEE_SAVED));
   1906   FlushForLoadStore(address, false, use_fastmem);
   1907 
   1908   // TODO: if address is constant, this can be simplified..
   1909 
   1910   // If we're coming from another block, just flush the load delay and hope for the best..
   1911   if (m_load_delay_dirty)
   1912     UpdateLoadDelay();
   1913 
   1914   // We'd need to be careful here if we weren't overwriting it..
   1915   ComputeLoadStoreAddressArg(cf, address, addr);
   1916   rvAsm->ANDI(RARG1, addr, ~0x3u);
   1917   GenerateLoad(RARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RRET; });
   1918 
   1919   if (inst->r.rt == Reg::zero)
   1920   {
   1921     FreeHostReg(addr.Index());
   1922     return;
   1923   }
   1924 
   1925   // lwl/lwr from a load-delayed value takes the new value, but it itself, is load delayed, so the original value is
   1926   // never written back. NOTE: can't trust T in cf because of the flush
   1927   const Reg rt = inst->r.rt;
   1928   GPR value;
   1929   if (m_load_delay_register == rt)
   1930   {
   1931     const u32 existing_ld_rt = (m_load_delay_value_register == NUM_HOST_REGS) ?
   1932                                  AllocateHostReg(HR_MODE_READ, HR_TYPE_LOAD_DELAY_VALUE, rt) :
   1933                                  m_load_delay_value_register;
   1934     RenameHostReg(existing_ld_rt, HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt);
   1935     value = GPR(existing_ld_rt);
   1936   }
   1937   else
   1938   {
   1939     if constexpr (EMULATE_LOAD_DELAYS)
   1940     {
   1941       value = GPR(AllocateHostReg(HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt));
   1942       if (const std::optional<u32> rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value())
   1943         rvAsm->MV(value, GPR(rtreg.value()));
   1944       else if (HasConstantReg(rt))
   1945         EmitMov(value, GetConstantRegU32(rt));
   1946       else
   1947         rvAsm->LW(value, PTR(&g_state.regs.r[static_cast<u8>(rt)]));
   1948     }
   1949     else
   1950     {
   1951       value = GPR(AllocateHostReg(HR_MODE_READ | HR_MODE_WRITE, HR_TYPE_CPU_REG, rt));
   1952     }
   1953   }
   1954 
   1955   DebugAssert(value.Index() != RARG2.Index() && value.Index() != RARG3.Index());
   1956   rvAsm->ANDI(RARG2, addr, 3);
   1957   rvAsm->SLLIW(RARG2, RARG2, 3); // *8
   1958   EmitMov(RARG3, 24);
   1959   rvAsm->SUBW(RARG3, RARG3, RARG2);
   1960 
   1961   if (inst->op == InstructionOp::lwl)
   1962   {
   1963     // const u32 mask = UINT32_C(0x00FFFFFF) >> shift;
   1964     // new_value = (value & mask) | (RWRET << (24 - shift));
   1965     EmitMov(RSCRATCH, 0xFFFFFFu);
   1966     rvAsm->SRLW(RSCRATCH, RSCRATCH, RARG2);
   1967     rvAsm->AND(value, value, RSCRATCH);
   1968     rvAsm->SLLW(RRET, RRET, RARG3);
   1969     rvAsm->OR(value, value, RRET);
   1970   }
   1971   else
   1972   {
   1973     // const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift);
   1974     // new_value = (value & mask) | (RWRET >> shift);
   1975     rvAsm->SRLW(RRET, RRET, RARG2);
   1976     EmitMov(RSCRATCH, 0xFFFFFF00u);
   1977     rvAsm->SLLW(RSCRATCH, RSCRATCH, RARG3);
   1978     rvAsm->AND(value, value, RSCRATCH);
   1979     rvAsm->OR(value, value, RRET);
   1980   }
   1981 
   1982   FreeHostReg(addr.Index());
   1983 
   1984   if (g_settings.gpu_pgxp_enable)
   1985   {
   1986     Flush(FLUSH_FOR_C_CALL);
   1987     rvAsm->MV(RARG3, value);
   1988     rvAsm->ANDI(RARG2, addr, ~0x3u);
   1989     EmitMov(RARG1, inst->bits);
   1990     EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LW));
   1991   }
   1992 }
   1993 
   1994 void CPU::NewRec::RISCV64Compiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
   1995                                                 const std::optional<VirtualMemoryAddress>& address)
   1996 {
   1997   const u32 index = static_cast<u32>(inst->r.rt.GetValue());
   1998   const auto [ptr, action] = GetGTERegisterPointer(index, true);
   1999   const std::optional<GPR> addr_reg =
   2000     g_settings.gpu_pgxp_enable ? std::optional<GPR>(GPR(AllocateTempHostReg(HR_CALLEE_SAVED))) : std::optional<GPR>();
   2001   FlushForLoadStore(address, false, use_fastmem);
   2002   const GPR addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
   2003   const GPR value = GenerateLoad(addr, MemoryAccessSize::Word, false, use_fastmem, [this, action = action]() {
   2004     return (action == GTERegisterAccessAction::CallHandler && g_settings.gpu_pgxp_enable) ?
   2005              GPR(AllocateTempHostReg(HR_CALLEE_SAVED)) :
   2006              RRET;
   2007   });
   2008 
   2009   switch (action)
   2010   {
   2011     case GTERegisterAccessAction::Ignore:
   2012     {
   2013       break;
   2014     }
   2015 
   2016     case GTERegisterAccessAction::Direct:
   2017     {
   2018       rvAsm->SW(value, PTR(ptr));
   2019       break;
   2020     }
   2021 
   2022     case GTERegisterAccessAction::SignExtend16:
   2023     {
   2024       EmitSExtH(RARG3, value);
   2025       rvAsm->SW(RARG3, PTR(ptr));
   2026       break;
   2027     }
   2028 
   2029     case GTERegisterAccessAction::ZeroExtend16:
   2030     {
   2031       EmitUExtH(RARG3, value);
   2032       rvAsm->SW(RARG3, PTR(ptr));
   2033       break;
   2034     }
   2035 
   2036     case GTERegisterAccessAction::CallHandler:
   2037     {
   2038       Flush(FLUSH_FOR_C_CALL);
   2039       rvAsm->MV(RARG2, value);
   2040       EmitMov(RARG1, index);
   2041       EmitCall(reinterpret_cast<const void*>(&GTE::WriteRegister));
   2042       break;
   2043     }
   2044 
   2045     case GTERegisterAccessAction::PushFIFO:
   2046     {
   2047       // SXY0 <- SXY1
   2048       // SXY1 <- SXY2
   2049       // SXY2 <- SXYP
   2050       DebugAssert(value.Index() != RARG2.Index() && value.Index() != RARG3.Index());
   2051       rvAsm->LW(RARG2, PTR(&g_state.gte_regs.SXY1[0]));
   2052       rvAsm->LW(RARG3, PTR(&g_state.gte_regs.SXY2[0]));
   2053       rvAsm->SW(RARG2, PTR(&g_state.gte_regs.SXY0[0]));
   2054       rvAsm->SW(RARG3, PTR(&g_state.gte_regs.SXY1[0]));
   2055       rvAsm->SW(value, PTR(&g_state.gte_regs.SXY2[0]));
   2056       break;
   2057     }
   2058 
   2059     default:
   2060     {
   2061       Panic("Unknown action");
   2062       return;
   2063     }
   2064   }
   2065 
   2066   if (g_settings.gpu_pgxp_enable)
   2067   {
   2068     Flush(FLUSH_FOR_C_CALL);
   2069     rvAsm->MV(RARG3, value);
   2070     if (value.Index() != RRET.Index())
   2071       FreeHostReg(value.Index());
   2072     rvAsm->MV(RARG2, addr);
   2073     FreeHostReg(addr_reg.value().Index());
   2074     EmitMov(RARG1, inst->bits);
   2075     EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LWC2));
   2076   }
   2077 }
   2078 
   2079 void CPU::NewRec::RISCV64Compiler::Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
   2080                                                const std::optional<VirtualMemoryAddress>& address)
   2081 {
   2082   AssertRegOrConstS(cf);
   2083   AssertRegOrConstT(cf);
   2084 
   2085   const std::optional<GPR> addr_reg =
   2086     g_settings.gpu_pgxp_enable ? std::optional<GPR>(GPR(AllocateTempHostReg(HR_CALLEE_SAVED))) : std::optional<GPR>();
   2087   FlushForLoadStore(address, true, use_fastmem);
   2088   const GPR addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
   2089   const GPR data = cf.valid_host_t ? CFGetRegT(cf) : RARG2;
   2090   if (!cf.valid_host_t)
   2091     MoveTToReg(RARG2, cf);
   2092 
   2093   GenerateStore(addr, data, size, use_fastmem);
   2094 
   2095   if (g_settings.gpu_pgxp_enable)
   2096   {
   2097     Flush(FLUSH_FOR_C_CALL);
   2098     MoveMIPSRegToReg(RARG3, cf.MipsT());
   2099     rvAsm->MV(RARG2, addr);
   2100     EmitMov(RARG1, inst->bits);
   2101     EmitCall(s_pgxp_mem_store_functions[static_cast<u32>(size)]);
   2102     FreeHostReg(addr_reg.value().Index());
   2103   }
   2104 }
   2105 
   2106 void CPU::NewRec::RISCV64Compiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
   2107                                                const std::optional<VirtualMemoryAddress>& address)
   2108 {
   2109   DebugAssert(size == MemoryAccessSize::Word && !sign);
   2110 
   2111   // TODO: this can take over rt's value if it's no longer needed
   2112   // NOTE: can't trust T in cf because of the alloc
   2113   const GPR addr = GPR(AllocateTempHostReg(HR_CALLEE_SAVED));
   2114   const GPR value = g_settings.gpu_pgxp_enable ? GPR(AllocateTempHostReg(HR_CALLEE_SAVED)) : RARG2;
   2115   if (g_settings.gpu_pgxp_enable)
   2116     MoveMIPSRegToReg(value, inst->r.rt);
   2117 
   2118   FlushForLoadStore(address, true, use_fastmem);
   2119 
   2120   // TODO: if address is constant, this can be simplified..
   2121   // We'd need to be careful here if we weren't overwriting it..
   2122   ComputeLoadStoreAddressArg(cf, address, addr);
   2123   rvAsm->ANDI(RARG1, addr, ~0x3u);
   2124   GenerateLoad(RARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RRET; });
   2125 
   2126   rvAsm->ANDI(RSCRATCH, addr, 3);
   2127   rvAsm->SLLIW(RSCRATCH, RSCRATCH, 3); // *8
   2128   rvAsm->ANDI(addr, addr, ~0x3u);
   2129 
   2130   // Need to load down here for PGXP-off, because it's in a volatile reg that can get overwritten by flush.
   2131   if (!g_settings.gpu_pgxp_enable)
   2132     MoveMIPSRegToReg(value, inst->r.rt);
   2133 
   2134   if (inst->op == InstructionOp::swl)
   2135   {
   2136     // const u32 mem_mask = UINT32_C(0xFFFFFF00) << shift;
   2137     // new_value = (RWRET & mem_mask) | (value >> (24 - shift));
   2138     EmitMov(RARG3, 0xFFFFFF00u);
   2139     rvAsm->SLLW(RARG3, RARG3, RSCRATCH);
   2140     rvAsm->AND(RRET, RRET, RARG3);
   2141 
   2142     EmitMov(RARG3, 24);
   2143     rvAsm->SUBW(RARG3, RARG3, RSCRATCH);
   2144     rvAsm->SRLW(value, value, RARG3);
   2145     rvAsm->OR(value, value, RRET);
   2146   }
   2147   else
   2148   {
   2149     // const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift);
   2150     // new_value = (RWRET & mem_mask) | (value << shift);
   2151     rvAsm->SLLW(value, value, RSCRATCH);
   2152 
   2153     EmitMov(RARG3, 24);
   2154     rvAsm->SUBW(RARG3, RARG3, RSCRATCH);
   2155     EmitMov(RSCRATCH, 0x00FFFFFFu);
   2156     rvAsm->SRLW(RSCRATCH, RSCRATCH, RARG3);
   2157     rvAsm->AND(RRET, RRET, RSCRATCH);
   2158     rvAsm->OR(value, value, RRET);
   2159   }
   2160 
   2161   if (!g_settings.gpu_pgxp_enable)
   2162   {
   2163     GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
   2164     FreeHostReg(addr.Index());
   2165   }
   2166   else
   2167   {
   2168     GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
   2169 
   2170     Flush(FLUSH_FOR_C_CALL);
   2171     rvAsm->MV(RARG3, value);
   2172     FreeHostReg(value.Index());
   2173     rvAsm->MV(RARG2, addr);
   2174     FreeHostReg(addr.Index());
   2175     EmitMov(RARG1, inst->bits);
   2176     EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SW));
   2177   }
   2178 }
   2179 
   2180 void CPU::NewRec::RISCV64Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
   2181                                                 const std::optional<VirtualMemoryAddress>& address)
   2182 {
   2183   const u32 index = static_cast<u32>(inst->r.rt.GetValue());
   2184   const auto [ptr, action] = GetGTERegisterPointer(index, false);
   2185   const GPR addr = (g_settings.gpu_pgxp_enable || action == GTERegisterAccessAction::CallHandler) ?
   2186                      GPR(AllocateTempHostReg(HR_CALLEE_SAVED)) :
   2187                      RARG1;
   2188   const GPR data = g_settings.gpu_pgxp_enable ? GPR(AllocateTempHostReg(HR_CALLEE_SAVED)) : RARG2;
   2189   FlushForLoadStore(address, true, use_fastmem);
   2190   ComputeLoadStoreAddressArg(cf, address, addr);
   2191 
   2192   switch (action)
   2193   {
   2194     case GTERegisterAccessAction::Direct:
   2195     {
   2196       rvAsm->LW(data, PTR(ptr));
   2197     }
   2198     break;
   2199 
   2200     case GTERegisterAccessAction::CallHandler:
   2201     {
   2202       // should already be flushed.. except in fastmem case
   2203       Flush(FLUSH_FOR_C_CALL);
   2204       EmitMov(RARG1, index);
   2205       EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));
   2206       rvAsm->MV(data, RRET);
   2207     }
   2208     break;
   2209 
   2210     default:
   2211     {
   2212       Panic("Unknown action");
   2213     }
   2214     break;
   2215   }
   2216 
   2217   GenerateStore(addr, data, size, use_fastmem);
   2218 
   2219   if (!g_settings.gpu_pgxp_enable)
   2220   {
   2221     if (addr.Index() != RARG1.Index())
   2222       FreeHostReg(addr.Index());
   2223   }
   2224   else
   2225   {
   2226     // TODO: This can be simplified because we don't need to validate in PGXP..
   2227     Flush(FLUSH_FOR_C_CALL);
   2228     rvAsm->MV(RARG3, data);
   2229     FreeHostReg(data.Index());
   2230     rvAsm->MV(RARG2, addr);
   2231     FreeHostReg(addr.Index());
   2232     EmitMov(RARG1, inst->bits);
   2233     EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWC2));
   2234   }
   2235 }
   2236 
   2237 void CPU::NewRec::RISCV64Compiler::Compile_mtc0(CompileFlags cf)
   2238 {
   2239   // TODO: we need better constant setting here.. which will need backprop
   2240   AssertRegOrConstT(cf);
   2241 
   2242   const Cop0Reg reg = static_cast<Cop0Reg>(MipsD());
   2243   const u32* ptr = GetCop0RegPtr(reg);
   2244   const u32 mask = GetCop0RegWriteMask(reg);
   2245   if (!ptr)
   2246   {
   2247     Compile_Fallback();
   2248     return;
   2249   }
   2250 
   2251   if (mask == 0)
   2252   {
   2253     // if it's a read-only register, ignore
   2254     DEBUG_LOG("Ignoring write to read-only cop0 reg {}", static_cast<u32>(reg));
   2255     return;
   2256   }
   2257 
   2258   // for some registers, we need to test certain bits
   2259   const bool needs_bit_test = (reg == Cop0Reg::SR);
   2260   const GPR new_value = RARG1;
   2261   const GPR old_value = RARG2;
   2262   const GPR changed_bits = RARG3;
   2263   const GPR mask_reg = RSCRATCH;
   2264 
   2265   // Load old value
   2266   rvAsm->LW(old_value, PTR(ptr));
   2267 
   2268   // No way we fit this in an immediate..
   2269   EmitMov(mask_reg, mask);
   2270 
   2271   // update value
   2272   // TODO: This is creating pointless MV instructions.. why?
   2273   if (cf.valid_host_t)
   2274     rvAsm->AND(new_value, CFGetRegT(cf), mask_reg);
   2275   else
   2276     EmitMov(new_value, GetConstantRegU32(cf.MipsT()) & mask);
   2277 
   2278   if (needs_bit_test)
   2279     rvAsm->XOR(changed_bits, old_value, new_value);
   2280   rvAsm->NOT(mask_reg, mask_reg);
   2281   rvAsm->AND(old_value, old_value, mask_reg);
   2282   rvAsm->OR(new_value, old_value, new_value);
   2283   rvAsm->SW(new_value, PTR(ptr));
   2284 
   2285   if (reg == Cop0Reg::SR)
   2286   {
   2287     // TODO: replace with register backup
   2288     // We could just inline the whole thing..
   2289     Flush(FLUSH_FOR_C_CALL);
   2290 
   2291     rvAsm->SRLIW(RSCRATCH, changed_bits, 16);
   2292     rvAsm->ANDI(RSCRATCH, RSCRATCH, 1);
   2293     SwitchToFarCode(true, &Assembler::BEQ, RSCRATCH, zero);
   2294     rvAsm->ADDI(sp, sp, -16);
   2295     rvAsm->SW(RARG1, 0, sp);
   2296     EmitCall(reinterpret_cast<const void*>(&CPU::UpdateMemoryPointers));
   2297     rvAsm->LW(RARG1, 0, sp);
   2298     rvAsm->ADDI(sp, sp, 16);
   2299     rvAsm->LD(RMEMBASE, PTR(&g_state.fastmem_base));
   2300     SwitchToNearCode(true);
   2301 
   2302     TestInterrupts(RARG1);
   2303   }
   2304   else if (reg == Cop0Reg::CAUSE)
   2305   {
   2306     rvAsm->LW(RARG1, PTR(&g_state.cop0_regs.sr.bits));
   2307     TestInterrupts(RARG1);
   2308   }
   2309 
   2310   if (reg == Cop0Reg::DCIC && g_settings.cpu_recompiler_memory_exceptions)
   2311   {
   2312     // TODO: DCIC handling for debug breakpoints
   2313     WARNING_LOG("TODO: DCIC handling for debug breakpoints");
   2314   }
   2315 }
   2316 
   2317 void CPU::NewRec::RISCV64Compiler::Compile_rfe(CompileFlags cf)
   2318 {
   2319   // shift mode bits right two, preserving upper bits
   2320   rvAsm->LW(RARG1, PTR(&g_state.cop0_regs.sr.bits));
   2321   rvAsm->SRLIW(RSCRATCH, RARG1, 2);
   2322   rvAsm->ANDI(RSCRATCH, RSCRATCH, 0xf);
   2323   rvAsm->ANDI(RARG1, RARG1, ~0xfu);
   2324   rvAsm->OR(RARG1, RARG1, RSCRATCH);
   2325   rvAsm->SW(RARG1, PTR(&g_state.cop0_regs.sr.bits));
   2326 
   2327   TestInterrupts(RARG1);
   2328 }
   2329 
   2330 void CPU::NewRec::RISCV64Compiler::TestInterrupts(const biscuit::GPR& sr)
   2331 {
   2332   DebugAssert(sr != RSCRATCH);
   2333 
   2334   // if Iec == 0 then goto no_interrupt
   2335   Label no_interrupt;
   2336   rvAsm->ANDI(RSCRATCH, sr, 1);
   2337   rvAsm->BEQZ(RSCRATCH, &no_interrupt);
   2338 
   2339   // sr & cause
   2340   rvAsm->LW(RSCRATCH, PTR(&g_state.cop0_regs.cause.bits));
   2341   rvAsm->AND(sr, sr, RSCRATCH);
   2342 
   2343   // ((sr & cause) & 0xff00) == 0 goto no_interrupt
   2344   rvAsm->SRLIW(sr, sr, 8);
   2345   rvAsm->ANDI(sr, sr, 0xFF);
   2346   SwitchToFarCode(true, &Assembler::BEQ, sr, zero);
   2347 
   2348   BackupHostState();
   2349 
   2350   // Update load delay, this normally happens at the end of an instruction, but we're finishing it early.
   2351   UpdateLoadDelay();
   2352 
   2353   Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);
   2354 
   2355   // Can't use EndBlockWithException() here, because it'll use the wrong PC.
   2356   // Can't use RaiseException() on the fast path if we're the last instruction, because the next PC is unknown.
   2357   if (!iinfo->is_last_instruction)
   2358   {
   2359     EmitMov(RARG1, Cop0Registers::CAUSE::MakeValueForException(Exception::INT, iinfo->is_branch_instruction, false,
   2360                                                                (inst + 1)->cop.cop_n));
   2361     EmitMov(RARG2, m_compiler_pc);
   2362     EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
   2363     m_dirty_pc = false;
   2364     EndAndLinkBlock(std::nullopt, true, false);
   2365   }
   2366   else
   2367   {
   2368     if (m_dirty_pc)
   2369       EmitMov(RARG1, m_compiler_pc);
   2370     rvAsm->SW(biscuit::zero, PTR(&g_state.downcount));
   2371     if (m_dirty_pc)
   2372       rvAsm->SW(RARG1, PTR(&g_state.pc));
   2373     m_dirty_pc = false;
   2374     EndAndLinkBlock(std::nullopt, false, true);
   2375   }
   2376 
   2377   RestoreHostState();
   2378   SwitchToNearCode(false);
   2379 
   2380   rvAsm->Bind(&no_interrupt);
   2381 }
   2382 
   2383 void CPU::NewRec::RISCV64Compiler::Compile_mfc2(CompileFlags cf)
   2384 {
   2385   const u32 index = inst->cop.Cop2Index();
   2386   const Reg rt = inst->r.rt;
   2387 
   2388   const auto [ptr, action] = GetGTERegisterPointer(index, false);
   2389   if (action == GTERegisterAccessAction::Ignore)
   2390     return;
   2391 
   2392   u32 hreg;
   2393   if (action == GTERegisterAccessAction::Direct)
   2394   {
   2395     hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
   2396                            EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
   2397     rvAsm->LW(GPR(hreg), PTR(ptr));
   2398   }
   2399   else if (action == GTERegisterAccessAction::CallHandler)
   2400   {
   2401     Flush(FLUSH_FOR_C_CALL);
   2402     EmitMov(RARG1, index);
   2403     EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));
   2404 
   2405     hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
   2406                            EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
   2407     rvAsm->MV(GPR(hreg), RRET);
   2408   }
   2409   else
   2410   {
   2411     Panic("Unknown action");
   2412   }
   2413 
   2414   if (g_settings.gpu_pgxp_enable)
   2415   {
   2416     Flush(FLUSH_FOR_C_CALL);
   2417     EmitMov(RARG1, inst->bits);
   2418     rvAsm->MV(RARG2, GPR(hreg));
   2419     EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_MFC2));
   2420   }
   2421 }
   2422 
   2423 void CPU::NewRec::RISCV64Compiler::Compile_mtc2(CompileFlags cf)
   2424 {
   2425   const u32 index = inst->cop.Cop2Index();
   2426   const auto [ptr, action] = GetGTERegisterPointer(index, true);
   2427   if (action == GTERegisterAccessAction::Ignore)
   2428     return;
   2429 
   2430   if (action == GTERegisterAccessAction::Direct)
   2431   {
   2432     if (cf.const_t)
   2433       StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), ptr);
   2434     else
   2435       rvAsm->SW(CFGetRegT(cf), PTR(ptr));
   2436   }
   2437   else if (action == GTERegisterAccessAction::SignExtend16 || action == GTERegisterAccessAction::ZeroExtend16)
   2438   {
   2439     const bool sign = (action == GTERegisterAccessAction::SignExtend16);
   2440     if (cf.valid_host_t)
   2441     {
   2442       sign ? EmitSExtH(RARG1, CFGetRegT(cf)) : EmitUExtH(RARG1, CFGetRegT(cf));
   2443       rvAsm->SW(RARG1, PTR(ptr));
   2444     }
   2445     else if (cf.const_t)
   2446     {
   2447       const u16 cv = Truncate16(GetConstantRegU32(cf.MipsT()));
   2448       StoreConstantToCPUPointer(sign ? ::SignExtend32(cv) : ::ZeroExtend32(cv), ptr);
   2449     }
   2450     else
   2451     {
   2452       Panic("Unsupported setup");
   2453     }
   2454   }
   2455   else if (action == GTERegisterAccessAction::CallHandler)
   2456   {
   2457     Flush(FLUSH_FOR_C_CALL);
   2458     EmitMov(RARG1, index);
   2459     MoveTToReg(RARG2, cf);
   2460     EmitCall(reinterpret_cast<const void*>(&GTE::WriteRegister));
   2461   }
   2462   else if (action == GTERegisterAccessAction::PushFIFO)
   2463   {
   2464     // SXY0 <- SXY1
   2465     // SXY1 <- SXY2
   2466     // SXY2 <- SXYP
   2467     DebugAssert(RRET.Index() != RARG2.Index() && RRET.Index() != RARG3.Index());
   2468     rvAsm->LW(RARG2, PTR(&g_state.gte_regs.SXY1[0]));
   2469     rvAsm->LW(RARG3, PTR(&g_state.gte_regs.SXY2[0]));
   2470     rvAsm->SW(RARG2, PTR(&g_state.gte_regs.SXY0[0]));
   2471     rvAsm->SW(RARG3, PTR(&g_state.gte_regs.SXY1[0]));
   2472     if (cf.valid_host_t)
   2473       rvAsm->SW(CFGetRegT(cf), PTR(&g_state.gte_regs.SXY2[0]));
   2474     else if (cf.const_t)
   2475       StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), &g_state.gte_regs.SXY2[0]);
   2476     else
   2477       Panic("Unsupported setup");
   2478   }
   2479   else
   2480   {
   2481     Panic("Unknown action");
   2482   }
   2483 }
   2484 
   2485 void CPU::NewRec::RISCV64Compiler::Compile_cop2(CompileFlags cf)
   2486 {
   2487   TickCount func_ticks;
   2488   GTE::InstructionImpl func = GTE::GetInstructionImpl(inst->bits, &func_ticks);
   2489 
   2490   Flush(FLUSH_FOR_C_CALL);
   2491   EmitMov(RARG1, inst->bits & GTE::Instruction::REQUIRED_BITS_MASK);
   2492   EmitCall(reinterpret_cast<const void*>(func));
   2493 
   2494   AddGTETicks(func_ticks);
   2495 }
   2496 
   2497 u32 CPU::NewRec::CompileLoadStoreThunk(void* thunk_code, u32 thunk_space, void* code_address, u32 code_size,
   2498                                        TickCount cycles_to_add, TickCount cycles_to_remove, u32 gpr_bitmask,
   2499                                        u8 address_register, u8 data_register, MemoryAccessSize size, bool is_signed,
   2500                                        bool is_load)
   2501 {
   2502   Assembler arm_asm(static_cast<u8*>(thunk_code), thunk_space);
   2503   Assembler* rvAsm = &arm_asm;
   2504 
   2505   static constexpr u32 GPR_SIZE = 8;
   2506 
   2507   // save regs
   2508   u32 num_gprs = 0;
   2509 
   2510   for (u32 i = 0; i < NUM_HOST_REGS; i++)
   2511   {
   2512     if ((gpr_bitmask & (1u << i)) && rvIsCallerSavedRegister(i) && (!is_load || data_register != i))
   2513       num_gprs++;
   2514   }
   2515 
   2516   const u32 stack_size = (((num_gprs + 1) & ~1u) * GPR_SIZE);
   2517 
   2518   if (stack_size > 0)
   2519   {
   2520     rvAsm->ADDI(sp, sp, -static_cast<s32>(stack_size));
   2521 
   2522     u32 stack_offset = 0;
   2523     for (u32 i = 0; i < NUM_HOST_REGS; i++)
   2524     {
   2525       if ((gpr_bitmask & (1u << i)) && rvIsCallerSavedRegister(i) && (!is_load || data_register != i))
   2526       {
   2527         rvAsm->SD(GPR(i), stack_offset, sp);
   2528         stack_offset += GPR_SIZE;
   2529       }
   2530     }
   2531   }
   2532 
   2533   if (cycles_to_add != 0)
   2534   {
   2535     // NOTE: we have to reload here, because memory writes can run DMA, which can screw with cycles
   2536     Assert(rvIsValidSExtITypeImm(cycles_to_add));
   2537     rvAsm->LW(RSCRATCH, PTR(&g_state.pending_ticks));
   2538     rvAsm->ADDIW(RSCRATCH, RSCRATCH, cycles_to_add);
   2539     rvAsm->SW(RSCRATCH, PTR(&g_state.pending_ticks));
   2540   }
   2541 
   2542   if (address_register != RARG1.Index())
   2543     rvAsm->MV(RARG1, GPR(address_register));
   2544 
   2545   if (!is_load)
   2546   {
   2547     if (data_register != RARG2.Index())
   2548       rvAsm->MV(RARG2, GPR(data_register));
   2549   }
   2550 
   2551   switch (size)
   2552   {
   2553     case MemoryAccessSize::Byte:
   2554     {
   2555       rvEmitCall(rvAsm, is_load ? reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryByte) :
   2556                                   reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryByte));
   2557     }
   2558     break;
   2559     case MemoryAccessSize::HalfWord:
   2560     {
   2561       rvEmitCall(rvAsm, is_load ? reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryHalfWord) :
   2562                                   reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryHalfWord));
   2563     }
   2564     break;
   2565     case MemoryAccessSize::Word:
   2566     {
   2567       rvEmitCall(rvAsm, is_load ? reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryWord) :
   2568                                   reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryWord));
   2569     }
   2570     break;
   2571   }
   2572 
   2573   if (is_load)
   2574   {
   2575     const GPR dst = GPR(data_register);
   2576     switch (size)
   2577     {
   2578       case MemoryAccessSize::Byte:
   2579       {
   2580         is_signed ? rvEmitSExtB(rvAsm, dst, RRET) : rvEmitUExtB(rvAsm, dst, RRET);
   2581       }
   2582       break;
   2583       case MemoryAccessSize::HalfWord:
   2584       {
   2585         is_signed ? rvEmitSExtH(rvAsm, dst, RRET) : rvEmitUExtH(rvAsm, dst, RRET);
   2586       }
   2587       break;
   2588       case MemoryAccessSize::Word:
   2589       {
   2590         if (dst.Index() != RRET.Index())
   2591           rvAsm->MV(dst, RRET);
   2592       }
   2593       break;
   2594     }
   2595   }
   2596 
   2597   if (cycles_to_remove != 0)
   2598   {
   2599     Assert(rvIsValidSExtITypeImm(-cycles_to_remove));
   2600     rvAsm->LW(RSCRATCH, PTR(&g_state.pending_ticks));
   2601     rvAsm->ADDIW(RSCRATCH, RSCRATCH, -cycles_to_remove);
   2602     rvAsm->SW(RSCRATCH, PTR(&g_state.pending_ticks));
   2603   }
   2604 
   2605   // restore regs
   2606   if (stack_size > 0)
   2607   {
   2608     u32 stack_offset = 0;
   2609     for (u32 i = 0; i < NUM_HOST_REGS; i++)
   2610     {
   2611       if ((gpr_bitmask & (1u << i)) && rvIsCallerSavedRegister(i) && (!is_load || data_register != i))
   2612       {
   2613         rvAsm->LD(GPR(i), stack_offset, sp);
   2614         stack_offset += GPR_SIZE;
   2615       }
   2616     }
   2617 
   2618     rvAsm->ADDI(sp, sp, stack_size);
   2619   }
   2620 
   2621   rvEmitJmp(rvAsm, static_cast<const u8*>(code_address) + code_size);
   2622 
   2623   return static_cast<u32>(rvAsm->GetCodeBuffer().GetSizeInBytes());
   2624 }
   2625 
   2626 #endif // CPU_ARCH_RISCV64