duckstation

duckstation, archived from the revision just before upstream relicensed it as a proprietary software project; this version is the libre one
git clone https://git.neptards.moe/u3shit/duckstation.git

cpu_recompiler_code_generator_aarch32.cpp (67558B)


      1 // SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
      2 // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
      3 
      4 #include "common/align.h"
      5 #include "common/assert.h"
      6 #include "common/log.h"
      7 #include "common/memmap.h"
      8 
      9 #include "cpu_code_cache_private.h"
     10 #include "cpu_core.h"
     11 #include "cpu_core_private.h"
     12 #include "cpu_recompiler_code_generator.h"
     13 #include "cpu_recompiler_thunks.h"
     14 #include "settings.h"
     15 #include "timing_event.h"
     16 
     17 #ifdef CPU_ARCH_ARM32
     18 
     19 Log_SetChannel(CPU::Recompiler);
     20 
     21 #ifdef ENABLE_HOST_DISASSEMBLY
     22 #include "vixl/aarch32/disasm-aarch32.h"
     23 #include <iostream>
     24 #endif
     25 
     26 namespace a32 = vixl::aarch32;
     27 
     28 namespace CPU::Recompiler {
     29 constexpr u32 FUNCTION_CALLEE_SAVED_SPACE_RESERVE = 80;  // 8 registers
     30 constexpr u32 FUNCTION_CALLER_SAVED_SPACE_RESERVE = 144; // 18 registers; whole frame -> 224 bytes
     31 constexpr u32 FUNCTION_STACK_SIZE = FUNCTION_CALLEE_SAVED_SPACE_RESERVE + FUNCTION_CALLER_SAVED_SPACE_RESERVE;
     32 
     33 static constexpr u32 TRAMPOLINE_AREA_SIZE = 4 * 1024;
     34 static std::unordered_map<const void*, u32> s_trampoline_targets;
     35 static u8* s_trampoline_start_ptr = nullptr;
     36 static u32 s_trampoline_used = 0;
     37 } // namespace CPU::Recompiler
     38 
     39 bool CPU::Recompiler::armIsCallerSavedRegister(u32 id)
     40 {
     41   return ((id <= 3) ||             // r0-r3
     42           (id == 12 || id == 14)); // r12 (ip), r14 (lr)
     43 }
     44 
     45 s32 CPU::Recompiler::armGetPCDisplacement(const void* current, const void* target)
     46 {
     47   Assert(Common::IsAlignedPow2(reinterpret_cast<size_t>(current), 4));
     48   Assert(Common::IsAlignedPow2(reinterpret_cast<size_t>(target), 4));
     49   return static_cast<s32>((reinterpret_cast<ptrdiff_t>(target) - reinterpret_cast<ptrdiff_t>(current)));
     50 }
     51 
     52 bool CPU::Recompiler::armIsPCDisplacementInImmediateRange(s32 displacement)
     53 {
     54   return (displacement >= -33554432 && displacement <= 33554428);
     55 }
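        // The bounds above are the A32 B/BL immediate range: a signed 24-bit
        // field shifted left two bits. A quick sanity check of those constants:
        static_assert(-33554432 == -(1 << 25) && 33554428 == (1 << 25) - 4,
                      "A32 branch range is [-(1 << 25), (1 << 25) - 4]");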
     56 
     57 void CPU::Recompiler::armEmitMov(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& rd, u32 imm)
     58 {
     59   if (vixl::IsUintN(16, imm))
     60   {
     61     armAsm->mov(vixl::aarch32::al, rd, imm & 0xffff);
     62     return;
     63   }
     64 
     65   armAsm->mov(vixl::aarch32::al, rd, imm & 0xffff);
     66   armAsm->movt(vixl::aarch32::al, rd, imm >> 16);
     67 }
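        // For example, armEmitMov(armAsm, r0, 0xDEADBEEF) emits roughly:
        //   mov  r0, #0xBEEF   ; movw, low halfword
        //   movt r0, #0xDEAD   ; high halfword
        // while a value that fits in 16 bits collapses to the single mov above.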
     68 
     69 void CPU::Recompiler::armMoveAddressToReg(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg,
     70                                           const void* addr)
     71 {
     72   armEmitMov(armAsm, reg, static_cast<u32>(reinterpret_cast<uintptr_t>(addr)));
     73 }
     74 
     75 void CPU::Recompiler::armEmitJmp(vixl::aarch32::Assembler* armAsm, const void* ptr, bool force_inline)
     76 {
     77   const void* cur = armAsm->GetCursorAddress<const void*>();
     78   s32 displacement = armGetPCDisplacement(cur, ptr);
     79   bool use_bx = !armIsPCDisplacementInImmediateRange(displacement);
     80   if (use_bx && !force_inline)
     81   {
     82     if (u8* trampoline = armGetJumpTrampoline(ptr); trampoline)
     83     {
     84       displacement = armGetPCDisplacement(cur, trampoline);
     85       use_bx = !armIsPCDisplacementInImmediateRange(displacement);
     86     }
     87   }
     88 
     89   if (use_bx)
     90   {
     91     armMoveAddressToReg(armAsm, RSCRATCH, ptr);
     92     armAsm->bx(RSCRATCH);
     93   }
     94   else
     95   {
     96     a32::Label label(displacement + armAsm->GetCursorOffset());
     97     armAsm->b(&label);
     98   }
     99 }
    100 
    101 void CPU::Recompiler::armEmitCall(vixl::aarch32::Assembler* armAsm, const void* ptr, bool force_inline)
    102 {
    103   const void* cur = armAsm->GetCursorAddress<const void*>();
    104   s32 displacement = armGetPCDisplacement(cur, ptr);
    105   bool use_blx = !armIsPCDisplacementInImmediateRange(displacement);
    106   if (use_blx && !force_inline)
    107   {
    108     if (u8* trampoline = armGetJumpTrampoline(ptr); trampoline)
    109     {
    110       displacement = armGetPCDisplacement(cur, trampoline);
    111       use_blx = !armIsPCDisplacementInImmediateRange(displacement);
    112     }
    113   }
    114 
    115   if (use_blx)
    116   {
    117     armMoveAddressToReg(armAsm, RSCRATCH, ptr);
    118     armAsm->blx(RSCRATCH);
    119   }
    120   else
    121   {
    122     a32::Label label(displacement + armAsm->GetCursorOffset());
    123     armAsm->bl(&label);
    124   }
    125 }
    126 
    127 void CPU::Recompiler::armEmitCondBranch(vixl::aarch32::Assembler* armAsm, vixl::aarch32::Condition cond,
    128                                         const void* ptr)
    129 {
    130   const s32 displacement = armGetPCDisplacement(armAsm->GetCursorAddress<const void*>(), ptr);
    131   if (!armIsPCDisplacementInImmediateRange(displacement))
    132   {
    133     armMoveAddressToReg(armAsm, RSCRATCH, ptr);
    134     armAsm->blx(cond, RSCRATCH);
    135   }
    136   else
    137   {
    138     a32::Label label(displacement + armAsm->GetCursorOffset());
    139     armAsm->b(cond, &label);
    140   }
    141 }
    142 
    143 void CPU::Recompiler::armEmitFarLoad(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg,
    144                                      const void* addr)
    145 {
    146   armMoveAddressToReg(armAsm, reg, addr);
    147   armAsm->ldr(reg, vixl::aarch32::MemOperand(reg));
    148 }
    149 
    150 void CPU::Recompiler::armEmitFarStore(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg,
    151                                       const void* addr, const vixl::aarch32::Register& tempreg)
    152 {
    153   armMoveAddressToReg(armAsm, tempreg, addr);
    154   armAsm->str(reg, vixl::aarch32::MemOperand(tempreg));
    155 }
    156 
    157 void CPU::CodeCache::DisassembleAndLogHostCode(const void* start, u32 size)
    158 {
    159 #ifdef ENABLE_HOST_DISASSEMBLY
    160   a32::PrintDisassembler dis(std::cout, 0);
    161   dis.SetCodeAddress(reinterpret_cast<uintptr_t>(start));
    162   dis.DisassembleA32Buffer(static_cast<const u32*>(start), size);
    163 #else
    164   ERROR_LOG("Not compiled with ENABLE_HOST_DISASSEMBLY.");
    165 #endif
    166 }
    167 
    168 u32 CPU::CodeCache::GetHostInstructionCount(const void* start, u32 size)
    169 {
    170   return size / a32::kA32InstructionSizeInBytes;
    171 }
    172 
    173 u32 CPU::CodeCache::EmitJump(void* code, const void* dst, bool flush_icache)
    174 {
    175   using namespace vixl::aarch32;
    176   using namespace CPU::Recompiler;
    177 
    178   const s32 disp = armGetPCDisplacement(code, dst);
    179   DebugAssert(armIsPCDisplacementInImmediateRange(disp));
    180 
    181   // A32 jumps are silly.
    182   {
    183     vixl::aarch32::Assembler emit(static_cast<vixl::byte*>(code), kA32InstructionSizeInBytes, a32::A32);
    184     a32::Label label(disp);
    185     emit.b(&label);
    186   }
    187 
    188   if (flush_icache)
    189     MemMap::FlushInstructionCache(code, kA32InstructionSizeInBytes);
    190 
    191   return kA32InstructionSizeInBytes;
    192 }
    193 
    194 u8* CPU::Recompiler::armGetJumpTrampoline(const void* target)
    195 {
    196   auto it = s_trampoline_targets.find(target);
    197   if (it != s_trampoline_targets.end())
    198     return s_trampoline_start_ptr + it->second;
    199 
    200   // align to 16 bytes?
    201   const u32 offset = s_trampoline_used; // Common::AlignUpPow2(s_trampoline_used, 16);
    202 
    203   // a mov/movt pair plus a jump (20 bytes reserved to be safe)
    204   if (TRAMPOLINE_AREA_SIZE - offset < 20)
    205   {
    206     Panic("Ran out of space in trampoline area");
    207     return nullptr;
    208   }
    209 
    210   u8* start = s_trampoline_start_ptr + offset;
    211   a32::Assembler armAsm(start, TRAMPOLINE_AREA_SIZE - offset);
    212   armMoveAddressToReg(&armAsm, RSCRATCH, target);
    213   armAsm.bx(RSCRATCH);
    214 
    215   const u32 size = static_cast<u32>(armAsm.GetSizeOfCodeGenerated());
    216   DebugAssert(size < 20);
    217   s_trampoline_targets.emplace(target, offset);
    218   s_trampoline_used = offset + static_cast<u32>(size);
    219 
    220   MemMap::FlushInstructionCache(start, size);
    221   return start;
    222 }
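        // Each cached trampoline is just the far-jump sequence above, roughly:
        //   movw r12, #lo16(target)
        //   movt r12, #hi16(target)
        //   bx   r12
        // (12 bytes at most), so every caller within branch range of the
        // trampoline area can reach the same stub with a plain b/bl.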
    223 
    224 u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
    225 {
    226   using namespace vixl::aarch32;
    227   using namespace CPU::Recompiler;
    228 
    229 #define PTR(x) a32::MemOperand(RSTATE, (s32)(((u8*)(x)) - ((u8*)&g_state)))
    230 
    231   Assembler actual_asm(static_cast<u8*>(code), code_size);
    232   Assembler* armAsm = &actual_asm;
    233 
    234 #ifdef VIXL_DEBUG
    235   vixl::CodeBufferCheckScope asm_check(armAsm, code_size, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
    236 #endif
    237 
    238   Label dispatch;
    239 
    240   g_enter_recompiler = armAsm->GetCursorAddress<decltype(g_enter_recompiler)>();
    241   {
    242     // reserve some space for saving caller-saved registers
    243     armAsm->sub(sp, sp, FUNCTION_STACK_SIZE);
    244 
    245     // Need the CPU state for basically everything :-)
    246     armMoveAddressToReg(armAsm, RSTATE, &g_state);
    247   }
    248 
    249   // check for pending events (and frame completion) before dispatching
    250   g_check_events_and_dispatch = armAsm->GetCursorAddress<const void*>();
    251   {
    252     Label skip_event_check;
    253     armAsm->ldr(RARG1, PTR(&g_state.pending_ticks));
    254     armAsm->ldr(RARG2, PTR(&g_state.downcount));
    255     armAsm->cmp(RARG1, RARG2);
    256     armAsm->b(lt, &skip_event_check);
    257 
    258     g_run_events_and_dispatch = armAsm->GetCursorAddress<const void*>();
    259     armEmitCall(armAsm, reinterpret_cast<const void*>(&TimingEvents::RunEvents), true);
    260 
    261     armAsm->bind(&skip_event_check);
    262   }
    263 
    264   // TODO: align?
    265   g_dispatcher = armAsm->GetCursorAddress<const void*>();
    266   {
    267     armAsm->bind(&dispatch);
    268 
    269     // RARG2 <- g_code_lut[pc >> 16]
    270     armAsm->ldr(RARG1, PTR(&g_state.pc));
    271     armMoveAddressToReg(armAsm, RARG3, g_code_lut.data());
    272     armAsm->lsr(RARG2, RARG1, 16);
    273     armAsm->ldr(RARG2, MemOperand(RARG3, RARG2, LSL, 2));
    274 
    275     // blx(RARG2[pc]) - the LUT entry is biased so the untruncated PC can index it
    276     armAsm->ldr(RARG1, MemOperand(RARG2, RARG1));
    277     armAsm->blx(RARG1);
    278   }
    279 
    280   g_compile_or_revalidate_block = armAsm->GetCursorAddress<const void*>();
    281   {
    282     armAsm->ldr(RARG1, PTR(&g_state.pc));
    283     armEmitCall(armAsm, reinterpret_cast<const void*>(&CompileOrRevalidateBlock), true);
    284     armAsm->b(&dispatch);
    285   }
    286 
    287   g_discard_and_recompile_block = armAsm->GetCursorAddress<const void*>();
    288   {
    289     armAsm->ldr(RARG1, PTR(&g_state.pc));
    290     armEmitCall(armAsm, reinterpret_cast<const void*>(&DiscardAndRecompileBlock), true);
    291     armAsm->b(&dispatch);
    292   }
    293 
    294   g_interpret_block = armAsm->GetCursorAddress<const void*>();
    295   {
    296     armEmitCall(armAsm, reinterpret_cast<const void*>(GetInterpretUncachedBlockFunction()), true);
    297     armAsm->b(&dispatch);
    298   }
    299 
    300   armAsm->FinalizeCode();
    301 
    302 #if 0
    303   // TODO: align?
    304   s_trampoline_targets.clear();
    305   s_trampoline_start_ptr = static_cast<u8*>(code) + armAsm->GetCursorOffset();
    306   s_trampoline_used = 0;
    307 #endif
    308 
    309 #undef PTR
    310   return static_cast<u32>(armAsm->GetCursorOffset()) /* + TRAMPOLINE_AREA_SIZE*/;
    311 }
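        // Taken together, the emitted dispatcher behaves roughly like this C
        // sketch (illustrative only; the real control flow is in the assembly
        // above, and blocks return to the dispatch loop through lr):
        #if 0
          for (;;)
          {
            if (g_state.pending_ticks >= g_state.downcount)
              TimingEvents::RunEvents();
            const void* block = g_code_lut[g_state.pc >> 16][g_state.pc]; // biased entry
            reinterpret_cast<void (*)()>(block)(); // returns here via lr
          }
        #endif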
    312 
    313 // Macros aren't used with old-rec.
    314 #undef RRET
    315 #undef RARG1
    316 #undef RARG2
    317 #undef RARG3
    318 #undef RSCRATCH
    319 #undef RSTATE
    320 
    321 namespace CPU::Recompiler {
    322 
    323 constexpr HostReg RCPUPTR = 4;
    324 constexpr HostReg RMEMBASEPTR = 3;
    325 constexpr HostReg RRETURN = 0;
    326 constexpr HostReg RARG1 = 0;
    327 constexpr HostReg RARG2 = 1;
    328 constexpr HostReg RARG3 = 2;
    329 constexpr HostReg RARG4 = 3;
    330 constexpr HostReg RSCRATCH = 12;
    331 
    332 static const a32::Register GetHostReg8(HostReg reg)
    333 {
    334   return a32::Register(reg);
    335 }
    336 
    337 static const a32::Register GetHostReg8(const Value& value)
    338 {
    339   DebugAssert(value.size == RegSize_8 && value.IsInHostRegister());
    340   return a32::Register(value.host_reg);
    341 }
    342 
    343 static const a32::Register GetHostReg16(HostReg reg)
    344 {
    345   return a32::Register(reg);
    346 }
    347 
    348 static const a32::Register GetHostReg16(const Value& value)
    349 {
    350   DebugAssert(value.size == RegSize_16 && value.IsInHostRegister());
    351   return a32::Register(value.host_reg);
    352 }
    353 
    354 static const a32::Register GetHostReg32(HostReg reg)
    355 {
    356   return a32::Register(reg);
    357 }
    358 
    359 static const a32::Register GetHostReg32(const Value& value)
    360 {
    361   DebugAssert(value.size == RegSize_32 && value.IsInHostRegister());
    362   return a32::Register(value.host_reg);
    363 }
    364 
    365 static const a32::Register GetCPUPtrReg()
    366 {
    367   return GetHostReg32(RCPUPTR);
    368 }
    369 
    370 static const a32::Register GetFastmemBasePtrReg()
    371 {
    372   return GetHostReg32(RMEMBASEPTR);
    373 }
    374 
    375 CodeGenerator::CodeGenerator()
    376   : m_register_cache(*this), m_near_emitter(static_cast<vixl::byte*>(CPU::CodeCache::GetFreeCodePointer()),
    377                                             CPU::CodeCache::GetFreeCodeSpace(), a32::A32),
    378     m_far_emitter(static_cast<vixl::byte*>(CPU::CodeCache::GetFreeFarCodePointer()),
    379                   CPU::CodeCache::GetFreeFarCodeSpace(), a32::A32),
    380     m_emit(&m_near_emitter)
    381 {
    382   InitHostRegs();
    383 }
    384 
    385 CodeGenerator::~CodeGenerator() = default;
    386 
    387 const char* CodeGenerator::GetHostRegName(HostReg reg, RegSize size /*= HostPointerSize*/)
    388 {
    389   static constexpr std::array<const char*, HostReg_Count> reg_names = {
    390     {"r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"}};
    391   if (reg >= static_cast<HostReg>(HostReg_Count))
    392     return "";
    393 
    394   switch (size)
    395   {
    396     case RegSize_32:
    397       return reg_names[reg];
    398     default:
    399       return "";
    400   }
    401 }
    402 
    403 void CodeGenerator::InitHostRegs()
    404 {
    405   // allocate nonvolatile before volatile
    406   // NOTE: vixl also uses r12 for the macro assembler
    407   m_register_cache.SetHostRegAllocationOrder({4, 5, 6, 7, 8, 9, 10, 11});
    408   m_register_cache.SetCallerSavedHostRegs({0, 1, 2, 3, 12});
    409   m_register_cache.SetCalleeSavedHostRegs({4, 5, 6, 7, 8, 9, 10, 11, 13, 14});
    410   m_register_cache.SetCPUPtrHostReg(RCPUPTR);
    411 }
    412 
    413 void CodeGenerator::SwitchToFarCode()
    414 {
    415   m_emit = &m_far_emitter;
    416 }
    417 
    418 void CodeGenerator::SwitchToNearCode()
    419 {
    420   m_emit = &m_near_emitter;
    421 }
    422 
    423 void* CodeGenerator::GetStartNearCodePointer() const
    424 {
    425   return static_cast<u8*>(CPU::CodeCache::GetFreeCodePointer());
    426 }
    427 
    428 void* CodeGenerator::GetCurrentNearCodePointer() const
    429 {
    430   return static_cast<u8*>(CPU::CodeCache::GetFreeCodePointer()) + m_near_emitter.GetCursorOffset();
    431 }
    432 
    433 void* CodeGenerator::GetCurrentFarCodePointer() const
    434 {
    435   return static_cast<u8*>(CPU::CodeCache::GetFreeFarCodePointer()) + m_far_emitter.GetCursorOffset();
    436 }
    437 
    438 Value CodeGenerator::GetValueInHostRegister(const Value& value, bool allow_zero_register /* = true */)
    439 {
    440   if (value.IsInHostRegister())
    441     return Value::FromHostReg(&m_register_cache, value.host_reg, value.size);
    442 
    443   Value new_value = m_register_cache.AllocateScratch(value.size);
    444   EmitCopyValue(new_value.host_reg, value);
    445   return new_value;
    446 }
    447 
    448 Value CodeGenerator::GetValueInHostOrScratchRegister(const Value& value, bool allow_zero_register /* = true */)
    449 {
    450   if (value.IsInHostRegister())
    451     return Value::FromHostReg(&m_register_cache, value.host_reg, value.size);
    452 
    453   Value new_value = Value::FromHostReg(&m_register_cache, RSCRATCH, value.size);
    454   EmitCopyValue(new_value.host_reg, value);
    455   return new_value;
    456 }
    457 
    458 void CodeGenerator::EmitBeginBlock(bool allocate_registers /* = true */)
    459 {
    460   if (allocate_registers)
    461   {
    462     // Save the link register, since we'll be calling functions.
    463     const bool link_reg_allocated = m_register_cache.AllocateHostReg(14);
    464     DebugAssert(link_reg_allocated);
    465     UNREFERENCED_VARIABLE(link_reg_allocated);
    466     m_register_cache.AssumeCalleeSavedRegistersAreSaved();
    467 
    468     // Store the CPU struct pointer. TODO: make this better.
    469     const bool cpu_reg_allocated = m_register_cache.AllocateHostReg(RCPUPTR);
    470     // m_emit->Mov(GetCPUPtrReg(), reinterpret_cast<uintptr_t>(&g_state));
    471     DebugAssert(cpu_reg_allocated);
    472     UNREFERENCED_VARIABLE(cpu_reg_allocated);
    473   }
    474 }
    475 
    476 void CodeGenerator::EmitEndBlock(bool free_registers /* = true */, const void* jump_to)
    477 {
    478   if (free_registers)
    479   {
    480     m_register_cache.FreeHostReg(RCPUPTR);
    481     m_register_cache.FreeHostReg(14);
    482     m_register_cache.PopCalleeSavedRegisters(true);
    483   }
    484 
    485   if (jump_to)
    486     armEmitJmp(m_emit, jump_to, true);
    487 }
    488 
    489 void CodeGenerator::EmitExceptionExit()
    490 {
    491   // ensure all unflushed registers are written back
    492   m_register_cache.FlushAllGuestRegisters(false, false);
    493 
    494   // the interpreter load delay might have its own value, but we'll overwrite it here anyway
    495   // technically RaiseException() and FlushPipeline() have already been called, but that should be okay
    496   m_register_cache.FlushLoadDelay(false);
    497 
    498   m_register_cache.PopCalleeSavedRegisters(false);
    499 
    500   armEmitJmp(m_emit, CodeCache::g_check_events_and_dispatch, true);
    501 }
    502 
    503 void CodeGenerator::EmitExceptionExitOnBool(const Value& value)
    504 {
    505   Assert(!value.IsConstant() && value.IsInHostRegister());
    506 
    507   m_register_cache.PushState();
    508 
    509   // TODO: This is... not great.
    510   a32::Label skip_branch;
    511   m_emit->tst(GetHostReg32(value.host_reg), 1);
    512   m_emit->b(a32::eq, &skip_branch);
    513   EmitBranch(GetCurrentFarCodePointer());
    514   m_emit->Bind(&skip_branch);
    515 
    516   SwitchToFarCode();
    517   EmitExceptionExit();
    518   SwitchToNearCode();
    519 
    520   m_register_cache.PopState();
    521 }
    522 
    523 const void* CodeGenerator::FinalizeBlock(u32* out_host_code_size, u32* out_host_far_code_size)
    524 {
    525   m_near_emitter.FinalizeCode();
    526   m_far_emitter.FinalizeCode();
    527 
    528   const void* code = CPU::CodeCache::GetFreeCodePointer();
    529   *out_host_code_size = static_cast<u32>(m_near_emitter.GetSizeOfCodeGenerated());
    530   *out_host_far_code_size = static_cast<u32>(m_far_emitter.GetSizeOfCodeGenerated());
    531 
    532   CPU::CodeCache::CommitCode(static_cast<u32>(m_near_emitter.GetSizeOfCodeGenerated()));
    533   CPU::CodeCache::CommitFarCode(static_cast<u32>(m_far_emitter.GetSizeOfCodeGenerated()));
    534 
    535   m_near_emitter = CodeEmitter(static_cast<vixl::byte*>(CPU::CodeCache::GetFreeCodePointer()),
    536                                CPU::CodeCache::GetFreeCodeSpace(), a32::A32);
    537   m_far_emitter = CodeEmitter(static_cast<vixl::byte*>(CPU::CodeCache::GetFreeFarCodePointer()),
    538                               CPU::CodeCache::GetFreeFarCodeSpace(), a32::A32);
    539 
    540   return code;
    541 }
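        // Near code holds the hot path; the far emitter collects the cold
        // paths (exception exits, slowmem fallbacks) emitted later in this
        // file, keeping the common case dense in the instruction cache.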
    542 
    543 void CodeGenerator::EmitSignExtend(HostReg to_reg, RegSize to_size, HostReg from_reg, RegSize from_size)
    544 {
    545   switch (to_size)
    546   {
    547     case RegSize_16:
    548     {
    549       switch (from_size)
    550       {
    551         case RegSize_8:
    552           m_emit->sxtb(GetHostReg16(to_reg), GetHostReg8(from_reg));
    553           m_emit->and_(GetHostReg16(to_reg), GetHostReg16(to_reg), 0xFFFF);
    554           return;
    555       }
    556     }
    557     break;
    558 
    559     case RegSize_32:
    560     {
    561       switch (from_size)
    562       {
    563         case RegSize_8:
    564           m_emit->sxtb(GetHostReg32(to_reg), GetHostReg8(from_reg));
    565           return;
    566         case RegSize_16:
    567           m_emit->sxth(GetHostReg32(to_reg), GetHostReg16(from_reg));
    568           return;
    569       }
    570     }
    571     break;
    572   }
    573 
    574   Panic("Unknown sign-extend combination");
    575 }
    576 
    577 void CodeGenerator::EmitZeroExtend(HostReg to_reg, RegSize to_size, HostReg from_reg, RegSize from_size)
    578 {
    579   switch (to_size)
    580   {
    581     case RegSize_16:
    582     {
    583       switch (from_size)
    584       {
    585         case RegSize_8:
    586           m_emit->and_(GetHostReg16(to_reg), GetHostReg8(from_reg), 0xFF);
    587           return;
    588       }
    589     }
    590     break;
    591 
    592     case RegSize_32:
    593     {
    594       switch (from_size)
    595       {
    596         case RegSize_8:
    597           m_emit->and_(GetHostReg32(to_reg), GetHostReg8(from_reg), 0xFF);
    598           return;
    599         case RegSize_16:
    600           m_emit->and_(GetHostReg32(to_reg), GetHostReg16(from_reg), 0xFFFF);
    601           return;
    602       }
    603     }
    604     break;
    605   }
    606 
    607   Panic("Unknown zero-extend combination");
    608 }
    609 
    610 void CodeGenerator::EmitCopyValue(HostReg to_reg, const Value& value)
    611 {
    612   // TODO: mov x, 0 -> xor x, x
    613   DebugAssert(value.IsConstant() || value.IsInHostRegister());
    614 
    615   switch (value.size)
    616   {
    617     case RegSize_8:
    618     case RegSize_16:
    619     case RegSize_32:
    620     {
    621       if (value.IsConstant())
    622         m_emit->Mov(GetHostReg32(to_reg), value.GetS32ConstantValue());
    623       else
    624         m_emit->Mov(GetHostReg32(to_reg), GetHostReg32(value.host_reg));
    625     }
    626     break;
    627 
    628     default:
    629       UnreachableCode();
    630       break;
    631   }
    632 }
    633 
    634 void CodeGenerator::EmitAdd(HostReg to_reg, HostReg from_reg, const Value& value, bool set_flags)
    635 {
    636   Assert(value.IsConstant() || value.IsInHostRegister());
    637 
    638   // if it's in a host register already, this is easy
    639   if (value.IsInHostRegister())
    640   {
    641     if (set_flags)
    642       m_emit->adds(GetHostReg32(to_reg), GetHostReg32(from_reg), GetHostReg32(value.host_reg));
    643     else
    644       m_emit->add(GetHostReg32(to_reg), GetHostReg32(from_reg), GetHostReg32(value.host_reg));
    645 
    646     return;
    647   }
    648 
    649   // do we need temporary storage for the constant, if it won't fit in an immediate?
    650   const s32 constant_value = value.GetS32ConstantValue();
    651   if (a32::ImmediateA32::IsImmediateA32(static_cast<u32>(constant_value)))
    652   {
    653     if (set_flags)
    654       m_emit->adds(GetHostReg32(to_reg), GetHostReg32(from_reg), constant_value);
    655     else
    656       m_emit->add(GetHostReg32(to_reg), GetHostReg32(from_reg), constant_value);
    657 
    658     return;
    659   }
    660 
    661   // need a temporary
    662   m_emit->Mov(GetHostReg32(RSCRATCH), constant_value);
    663   if (set_flags)
    664     m_emit->adds(GetHostReg32(to_reg), GetHostReg32(from_reg), GetHostReg32(RSCRATCH));
    665   else
    666     m_emit->add(GetHostReg32(to_reg), GetHostReg32(from_reg), GetHostReg32(RSCRATCH));
    667 }
    668 
    669 void CodeGenerator::EmitSub(HostReg to_reg, HostReg from_reg, const Value& value, bool set_flags)
    670 {
    671   Assert(value.IsConstant() || value.IsInHostRegister());
    672 
    673   // if it's in a host register already, this is easy
    674   if (value.IsInHostRegister())
    675   {
    676     if (set_flags)
    677       m_emit->subs(GetHostReg32(to_reg), GetHostReg32(from_reg), GetHostReg32(value.host_reg));
    678     else
    679       m_emit->sub(GetHostReg32(to_reg), GetHostReg32(from_reg), GetHostReg32(value.host_reg));
    680 
    681     return;
    682   }
    683 
    684   // do we need temporary storage for the constant, if it won't fit in an immediate?
    685   const s32 constant_value = value.GetS32ConstantValue();
    686   if (a32::ImmediateA32::IsImmediateA32(static_cast<u32>(constant_value)))
    687   {
    688     if (set_flags)
    689       m_emit->subs(GetHostReg32(to_reg), GetHostReg32(from_reg), constant_value);
    690     else
    691       m_emit->sub(GetHostReg32(to_reg), GetHostReg32(from_reg), constant_value);
    692 
    693     return;
    694   }
    695 
    696   // need a temporary
    697   m_emit->Mov(GetHostReg32(RSCRATCH), constant_value);
    698   if (set_flags)
    699     m_emit->subs(GetHostReg32(to_reg), GetHostReg32(from_reg), GetHostReg32(RSCRATCH));
    700   else
    701     m_emit->sub(GetHostReg32(to_reg), GetHostReg32(from_reg), GetHostReg32(RSCRATCH));
    702 }
    703 
    704 void CodeGenerator::EmitCmp(HostReg to_reg, const Value& value)
    705 {
    706   Assert(value.IsConstant() || value.IsInHostRegister());
    707 
    708   // if it's in a host register already, this is easy
    709   if (value.IsInHostRegister())
    710   {
    711     m_emit->cmp(GetHostReg32(to_reg), GetHostReg32(value.host_reg));
    712     return;
    713   }
    714 
    715   // do we need temporary storage for the constant, if it won't fit in an immediate?
    716   const s32 constant_value = value.GetS32ConstantValue();
    717   if (constant_value >= 0)
    718   {
    719     if (a32::ImmediateA32::IsImmediateA32(static_cast<u32>(constant_value)))
    720     {
    721       m_emit->cmp(GetHostReg32(to_reg), constant_value);
    722       return;
    723     }
    724   }
    725   else
    726   {
    727     if (a32::ImmediateA32::IsImmediateA32(static_cast<u32>(-constant_value)))
    728     {
    729       m_emit->cmn(GetHostReg32(to_reg), -constant_value);
    730       return;
    731     }
    732   }
    733 
    734   // need a temporary
    735   m_emit->Mov(GetHostReg32(RSCRATCH), constant_value);
    736   m_emit->cmp(GetHostReg32(to_reg), GetHostReg32(RSCRATCH));
    737 }
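        // e.g. "cmp r0, #-1" has no valid A32 immediate encoding, but
        // "cmn r0, #1" computes r0 + 1 and sets the same flags, which is why
        // negative constants are negated and fed to cmn above.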
    738 
    739 void CodeGenerator::EmitMul(HostReg to_reg_hi, HostReg to_reg_lo, const Value& lhs, const Value& rhs,
    740                             bool signed_multiply)
    741 {
    742   // We could use GetValueInHostRegister() here, but we run out of registers...
    743   // Value lhs_in_reg = GetValueInHostRegister(lhs);
    744   // Value rhs_in_reg = GetValueInHostRegister(rhs);
    745   const HostReg lhs_in_reg = lhs.IsInHostRegister() ? lhs.GetHostRegister() : (EmitCopyValue(RARG1, lhs), RARG1);
    746   const HostReg rhs_in_reg = rhs.IsInHostRegister() ? rhs.GetHostRegister() : (EmitCopyValue(RARG2, rhs), RARG2);
    747 
    748   if (lhs.size < RegSize_64)
    749   {
    750     if (signed_multiply)
    751     {
    752       m_emit->smull(GetHostReg32(to_reg_lo), GetHostReg32(to_reg_hi), GetHostReg32(lhs_in_reg),
    753                     GetHostReg32(rhs_in_reg));
    754     }
    755     else
    756     {
    757       m_emit->umull(GetHostReg32(to_reg_lo), GetHostReg32(to_reg_hi), GetHostReg32(lhs_in_reg),
    758                     GetHostReg32(rhs_in_reg));
    759     }
    760   }
    761   else
    762   {
    763     // TODO: Use mul + smulh
    764     Panic("Not implemented");
    765   }
    766 }
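        // smull/umull write the full 64-bit product to a register pair, which
        // lines up with the guest mult/multu writing HI and LO, e.g. signed:
        //   smull lo, hi, rs, rt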
    767 
    768 void CodeGenerator::EmitDiv(HostReg to_reg_quotient, HostReg to_reg_remainder, HostReg num, HostReg denom, RegSize size,
    769                             bool signed_divide)
    770 {
    771   // only 32-bit supported for now...
    772   Assert(size == RegSize_32);
    773 
    774   Value quotient_value;
    775   if (to_reg_quotient == HostReg_Count)
    776     quotient_value.SetHostReg(&m_register_cache, RSCRATCH, size);
    777   else
    778     quotient_value.SetHostReg(&m_register_cache, to_reg_quotient, size);
    779 
    780   if (signed_divide)
    781   {
    782     m_emit->sdiv(GetHostReg32(quotient_value), GetHostReg32(num), GetHostReg32(denom));
    783     if (to_reg_remainder != HostReg_Count)
    784     {
    785       m_emit->mul(GetHostReg32(to_reg_remainder), GetHostReg32(quotient_value), GetHostReg32(denom));
    786       m_emit->sub(GetHostReg32(to_reg_remainder), GetHostReg32(num), GetHostReg32(to_reg_remainder));
    787     }
    788   }
    789   else
    790   {
    791     m_emit->udiv(GetHostReg32(quotient_value), GetHostReg32(num), GetHostReg32(denom));
    792     if (to_reg_remainder != HostReg_Count)
    793     {
    794       m_emit->mul(GetHostReg32(to_reg_remainder), GetHostReg32(quotient_value), GetHostReg32(denom));
    795       m_emit->sub(GetHostReg32(to_reg_remainder), GetHostReg32(num), GetHostReg32(to_reg_remainder));
    796     }
    797   }
    798 }
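        // A32 has no hardware remainder, so it is reconstructed above as
        //   rem = num - (num / denom) * denom
        // via the mul+sub pair; on cores new enough to have sdiv/udiv this
        // could also fold into a single "mls rem, quot, denom, num".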
    799 
    800 void CodeGenerator::EmitInc(HostReg to_reg, RegSize size)
    801 {
    802   Panic("Not implemented");
    803 #if 0
    804   switch (size)
    805   {
    806     case RegSize_8:
    807       m_emit->inc(GetHostReg8(to_reg));
    808       break;
    809     case RegSize_16:
    810       m_emit->inc(GetHostReg16(to_reg));
    811       break;
    812     case RegSize_32:
    813       m_emit->inc(GetHostReg32(to_reg));
    814       break;
    815     default:
    816       UnreachableCode();
    817       break;
    818   }
    819 #endif
    820 }
    821 
    822 void CodeGenerator::EmitDec(HostReg to_reg, RegSize size)
    823 {
    824   Panic("Not implemented");
    825 #if 0
    826   switch (size)
    827   {
    828     case RegSize_8:
    829       m_emit->dec(GetHostReg8(to_reg));
    830       break;
    831     case RegSize_16:
    832       m_emit->dec(GetHostReg16(to_reg));
    833       break;
    834     case RegSize_32:
    835       m_emit->dec(GetHostReg32(to_reg));
    836       break;
    837     default:
    838       UnreachableCode();
    839       break;
    840   }
    841 #endif
    842 }
    843 
    844 void CodeGenerator::EmitShl(HostReg to_reg, HostReg from_reg, RegSize size, const Value& amount_value,
    845                             bool assume_amount_masked)
    846 {
    847   switch (size)
    848   {
    849     case RegSize_8:
    850     case RegSize_16:
    851     case RegSize_32:
    852     {
    853       if (amount_value.IsConstant())
    854       {
    855         m_emit->lsl(GetHostReg32(to_reg), GetHostReg32(from_reg), static_cast<u32>(amount_value.constant_value & 0x1F));
    856       }
    857       else if (assume_amount_masked)
    858       {
    859         m_emit->lsl(GetHostReg32(to_reg), GetHostReg32(from_reg), GetHostReg32(amount_value));
    860       }
    861       else
    862       {
    863         m_emit->and_(GetHostReg32(RSCRATCH), GetHostReg32(amount_value), 0x1F);
    864         m_emit->lsl(GetHostReg32(to_reg), GetHostReg32(from_reg), GetHostReg32(RSCRATCH));
    865       }
    866 
    867       if (size == RegSize_8)
    868         m_emit->and_(GetHostReg32(to_reg), GetHostReg32(to_reg), 0xFF);
    869       else if (size == RegSize_16)
    870         m_emit->and_(GetHostReg32(to_reg), GetHostReg32(to_reg), 0xFFFF);
    871     }
    872     break;
    873   }
    874 }
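        // e.g. an 8-bit guest shift of 0x80 << 1 leaves 0x100 in the host
        // register; the final mask truncates the result back to 0x00, matching
        // the guest's 8-bit wraparound.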
    875 
    876 void CodeGenerator::EmitShr(HostReg to_reg, HostReg from_reg, RegSize size, const Value& amount_value,
    877                             bool assume_amount_masked)
    878 {
    879   switch (size)
    880   {
    881     case RegSize_8:
    882     case RegSize_16:
    883     case RegSize_32:
    884     {
    885       if (amount_value.IsConstant())
    886       {
    887         m_emit->lsr(GetHostReg32(to_reg), GetHostReg32(from_reg), static_cast<u32>(amount_value.constant_value & 0x1F));
    888       }
    889       else if (assume_amount_masked)
    890       {
    891         m_emit->lsr(GetHostReg32(to_reg), GetHostReg32(from_reg), GetHostReg32(amount_value));
    892       }
    893       else
    894       {
    895         m_emit->and_(GetHostReg32(RSCRATCH), GetHostReg32(amount_value), 0x1F);
    896         m_emit->lsr(GetHostReg32(to_reg), GetHostReg32(from_reg), GetHostReg32(RSCRATCH));
    897       }
    898 
    899       if (size == RegSize_8)
    900         m_emit->and_(GetHostReg32(to_reg), GetHostReg32(to_reg), 0xFF);
    901       else if (size == RegSize_16)
    902         m_emit->and_(GetHostReg32(to_reg), GetHostReg32(to_reg), 0xFFFF);
    903     }
    904     break;
    905   }
    906 }
    907 
    908 void CodeGenerator::EmitSar(HostReg to_reg, HostReg from_reg, RegSize size, const Value& amount_value,
    909                             bool assume_amount_masked)
    910 {
    911   switch (size)
    912   {
    913     case RegSize_8:
    914     case RegSize_16:
    915     case RegSize_32:
    916     {
    917       if (amount_value.IsConstant())
    918       {
    919         m_emit->asr(GetHostReg32(to_reg), GetHostReg32(from_reg), static_cast<u32>(amount_value.constant_value & 0x1F));
    920       }
    921       else if (assume_amount_masked)
    922       {
    923         m_emit->asr(GetHostReg32(to_reg), GetHostReg32(from_reg), GetHostReg32(amount_value));
    924       }
    925       else
    926       {
    927         m_emit->and_(GetHostReg32(RSCRATCH), GetHostReg32(amount_value), 0x1F);
    928         m_emit->asr(GetHostReg32(to_reg), GetHostReg32(from_reg), GetHostReg32(RSCRATCH));
    929       }
    930 
    931       if (size == RegSize_8)
    932         m_emit->and_(GetHostReg32(to_reg), GetHostReg32(to_reg), 0xFF);
    933       else if (size == RegSize_16)
    934         m_emit->and_(GetHostReg32(to_reg), GetHostReg32(to_reg), 0xFFFF);
    935     }
    936     break;
    937   }
    938 }
    939 
    940 static bool CanFitInBitwiseImmediate(const Value& value)
    941 {
    942   return a32::ImmediateA32::IsImmediateA32(static_cast<u32>(value.constant_value));
    943 }
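        // A32 data-processing immediates are an 8-bit value rotated right by
        // an even amount, so masks such as 0xFF, 0xFF00 or 0xFF000000 encode
        // directly, while e.g. 0x12345678 takes the scratch-register path.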
    944 
    945 void CodeGenerator::EmitAnd(HostReg to_reg, HostReg from_reg, const Value& value)
    946 {
    947   Assert(value.IsConstant() || value.IsInHostRegister());
    948 
    949   // if it's in a host register already, this is easy
    950   if (value.IsInHostRegister())
    951   {
    952     m_emit->and_(GetHostReg32(to_reg), GetHostReg32(from_reg), GetHostReg32(value.host_reg));
    953     return;
    954   }
    955 
    956   // do we need temporary storage for the constant, if it won't fit in an immediate?
    957   if (CanFitInBitwiseImmediate(value))
    958   {
    959     m_emit->and_(GetHostReg32(to_reg), GetHostReg32(from_reg), s32(value.constant_value));
    960     return;
    961   }
    962 
    963   // need a temporary
    964   m_emit->Mov(GetHostReg32(RSCRATCH), s32(value.constant_value));
    965   m_emit->and_(GetHostReg32(to_reg), GetHostReg32(from_reg), GetHostReg32(RSCRATCH));
    966 }
    967 
    968 void CodeGenerator::EmitOr(HostReg to_reg, HostReg from_reg, const Value& value)
    969 {
    970   Assert(value.IsConstant() || value.IsInHostRegister());
    971 
    972   // if it's in a host register already, this is easy
    973   if (value.IsInHostRegister())
    974   {
    975     m_emit->orr(GetHostReg32(to_reg), GetHostReg32(from_reg), GetHostReg32(value.host_reg));
    976     return;
    977   }
    978 
    979   // do we need temporary storage for the constant, if it won't fit in an immediate?
    980   if (CanFitInBitwiseImmediate(value))
    981   {
    982     m_emit->orr(GetHostReg32(to_reg), GetHostReg32(from_reg), s32(value.constant_value));
    983     return;
    984   }
    985 
    986   // need a temporary
    987   m_emit->Mov(GetHostReg32(RSCRATCH), s32(value.constant_value));
    988   m_emit->orr(GetHostReg32(to_reg), GetHostReg32(from_reg), GetHostReg32(RSCRATCH));
    989 }
    990 
    991 void CodeGenerator::EmitXor(HostReg to_reg, HostReg from_reg, const Value& value)
    992 {
    993   Assert(value.IsConstant() || value.IsInHostRegister());
    994 
    995   // if it's in a host register already, this is easy
    996   if (value.IsInHostRegister())
    997   {
    998     m_emit->eor(GetHostReg32(to_reg), GetHostReg32(from_reg), GetHostReg32(value.host_reg));
    999     return;
   1000   }
   1001 
   1002   // do we need temporary storage for the constant, if it won't fit in an immediate?
   1003   if (CanFitInBitwiseImmediate(value))
   1004   {
   1005     m_emit->eor(GetHostReg32(to_reg), GetHostReg32(from_reg), s32(value.constant_value));
   1006     return;
   1007   }
   1008 
   1009   // need a temporary
   1010   m_emit->Mov(GetHostReg32(RSCRATCH), s32(value.constant_value));
   1011   m_emit->eor(GetHostReg32(to_reg), GetHostReg32(from_reg), GetHostReg32(RSCRATCH));
   1012 }
   1013 
   1014 void CodeGenerator::EmitTest(HostReg to_reg, const Value& value)
   1015 {
   1016   Assert(value.IsConstant() || value.IsInHostRegister());
   1017 
   1018   // if it's in a host register already, this is easy
   1019   if (value.IsInHostRegister())
   1020   {
   1021     m_emit->tst(GetHostReg32(to_reg), GetHostReg32(value.host_reg));
   1022     return;
   1023   }
   1024 
   1025   // do we need temporary storage for the constant, if it won't fit in an immediate?
   1026   if (CanFitInBitwiseImmediate(value))
   1027   {
   1028     m_emit->tst(GetHostReg32(to_reg), s32(value.constant_value));
   1029     return;
   1030   }
   1031 
   1032   // need a temporary
   1033   m_emit->Mov(GetHostReg32(RSCRATCH), s32(value.constant_value));
   1034   m_emit->tst(GetHostReg32(to_reg), GetHostReg32(RSCRATCH));
   1035 }
   1036 
   1037 void CodeGenerator::EmitNot(HostReg to_reg, RegSize size)
   1038 {
   1039   switch (size)
   1040   {
   1041     case RegSize_8:
   1042       m_emit->mvn(GetHostReg8(to_reg), GetHostReg8(to_reg));
   1043       m_emit->and_(GetHostReg8(to_reg), GetHostReg8(to_reg), 0xFF);
   1044       break;
   1045 
   1046     case RegSize_16:
   1047       m_emit->mvn(GetHostReg16(to_reg), GetHostReg16(to_reg));
   1048       m_emit->and_(GetHostReg16(to_reg), GetHostReg16(to_reg), 0xFFFF);
   1049       break;
   1050 
   1051     case RegSize_32:
   1052       m_emit->mvn(GetHostReg32(to_reg), GetHostReg32(to_reg));
   1053       break;
   1054 
   1055     default:
   1056       break;
   1057   }
   1058 }
   1059 
   1060 void CodeGenerator::EmitSetConditionResult(HostReg to_reg, RegSize to_size, Condition condition)
   1061 {
   1062   if (condition == Condition::Always)
   1063   {
   1064     m_emit->Mov(GetHostReg32(to_reg), 1);
   1065     return;
   1066   }
   1067 
   1068   a32::Condition acond(a32::Condition::Never());
   1069   switch (condition)
   1070   {
   1071     case Condition::NotEqual:
   1072       acond = a32::ne;
   1073       break;
   1074 
   1075     case Condition::Equal:
   1076       acond = a32::eq;
   1077       break;
   1078 
   1079     case Condition::Overflow:
   1080       acond = a32::vs;
   1081       break;
   1082 
   1083     case Condition::Greater:
   1084       acond = a32::gt;
   1085       break;
   1086 
   1087     case Condition::GreaterEqual:
   1088       acond = a32::ge;
   1089       break;
   1090 
   1091     case Condition::Less:
   1092       acond = a32::lt;
   1093       break;
   1094 
   1095     case Condition::LessEqual:
   1096       acond = a32::le;
   1097       break;
   1098 
   1099     case Condition::Negative:
   1100       acond = a32::mi;
   1101       break;
   1102 
   1103     case Condition::PositiveOrZero:
   1104       acond = a32::pl;
   1105       break;
   1106 
   1107     case Condition::Above:
   1108       acond = a32::hi;
   1109       break;
   1110 
   1111     case Condition::AboveEqual:
   1112       acond = a32::cs;
   1113       break;
   1114 
   1115     case Condition::Below:
   1116       acond = a32::cc;
   1117       break;
   1118 
   1119     case Condition::BelowEqual:
   1120       acond = a32::ls;
   1121       break;
   1122 
   1123     default:
   1124       UnreachableCode();
   1125       return;
   1126   }
   1127 
   1128   m_emit->mov(GetHostReg32(to_reg), 0);
   1129   m_emit->mov(acond, GetHostReg32(to_reg), 1);
   1130 }
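        // i.e. the branchless "mov rd, #0" / "mov<cond> rd, #1" idiom rather
        // than a compare-and-branch sequence.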
   1131 
   1132 u32 CodeGenerator::PrepareStackForCall()
   1133 {
   1134   m_register_cache.PushCallerSavedRegisters();
   1135   m_membase_loaded = false;
   1136   return 0;
   1137 }
   1138 
   1139 void CodeGenerator::RestoreStackAfterCall(u32 adjust_size)
   1140 {
   1141   m_register_cache.PopCallerSavedRegisters();
   1142 }
   1143 
   1144 void CodeGenerator::EmitCall(const void* ptr)
   1145 {
   1146   armEmitCall(m_emit, ptr, false);
   1147 }
   1148 
   1149 void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr)
   1150 {
   1151   if (return_value)
   1152     return_value->Discard();
   1153 
   1154   // save caller-saved registers
   1155   const u32 adjust_size = PrepareStackForCall();
   1156 
   1157   // actually call the function
   1158   EmitCall(ptr);
   1159 
   1160   // restore caller-saved registers
   1161   RestoreStackAfterCall(adjust_size);
   1162 
   1163   // copy out return value if requested
   1164   if (return_value)
   1165   {
   1166     return_value->Undiscard();
   1167     EmitCopyValue(return_value->GetHostRegister(), Value::FromHostReg(&m_register_cache, RRETURN, return_value->size));
   1168   }
   1169 }
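        // Per the AAPCS, r0-r3 carry the first four arguments and r0 the
        // return value, which is why RARG1..RARG4 and RRETURN alias host
        // registers 0-3 in the constants near the top of this namespace.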
   1170 
   1171 void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr, const Value& arg1)
   1172 {
   1173   if (return_value)
   1174     return_value->Discard();
   1175 
   1176   // save caller-saved registers
   1177   const u32 adjust_size = PrepareStackForCall();
   1178 
   1179   // push arguments
   1180   EmitCopyValue(RARG1, arg1);
   1181 
   1182   // actually call the function
   1183   EmitCall(ptr);
   1184 
   1185   // restore caller-saved registers
   1186   RestoreStackAfterCall(adjust_size);
   1187 
   1188   // copy out return value if requested
   1189   if (return_value)
   1190   {
   1191     return_value->Undiscard();
   1192     EmitCopyValue(return_value->GetHostRegister(), Value::FromHostReg(&m_register_cache, RRETURN, return_value->size));
   1193   }
   1194 }
   1195 
   1196 void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr, const Value& arg1, const Value& arg2)
   1197 {
   1198   if (return_value)
   1199     return_value->Discard();
   1200 
   1201   // save caller-saved registers
   1202   const u32 adjust_size = PrepareStackForCall();
   1203 
   1204   // push arguments
   1205   EmitCopyValue(RARG1, arg1);
   1206   EmitCopyValue(RARG2, arg2);
   1207 
   1208   // actually call the function
   1209   EmitCall(ptr);
   1210 
   1211   // restore caller-saved registers
   1212   RestoreStackAfterCall(adjust_size);
   1213 
   1214   // copy out return value if requested
   1215   if (return_value)
   1216   {
   1217     return_value->Undiscard();
   1218     EmitCopyValue(return_value->GetHostRegister(), Value::FromHostReg(&m_register_cache, RRETURN, return_value->size));
   1219   }
   1220 }
   1221 
   1222 void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr, const Value& arg1, const Value& arg2,
   1223                                         const Value& arg3)
   1224 {
   1225   if (return_value)
   1226     m_register_cache.DiscardHostReg(return_value->GetHostRegister());
   1227 
   1228   // save caller-saved registers
   1229   const u32 adjust_size = PrepareStackForCall();
   1230 
   1231   // push arguments
   1232   EmitCopyValue(RARG1, arg1);
   1233   EmitCopyValue(RARG2, arg2);
   1234   EmitCopyValue(RARG3, arg3);
   1235 
   1236   // actually call the function
   1237   EmitCall(ptr);
   1238 
   1239   // restore caller-saved registers
   1240   RestoreStackAfterCall(adjust_size);
   1241 
   1242   // copy out return value if requested
   1243   if (return_value)
   1244   {
   1245     return_value->Undiscard();
   1246     EmitCopyValue(return_value->GetHostRegister(), Value::FromHostReg(&m_register_cache, RRETURN, return_value->size));
   1247   }
   1248 }
   1249 
   1250 void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr, const Value& arg1, const Value& arg2,
   1251                                         const Value& arg3, const Value& arg4)
   1252 {
   1253   if (return_value)
   1254     return_value->Discard();
   1255 
   1256   // save caller-saved registers
   1257   const u32 adjust_size = PrepareStackForCall();
   1258 
   1259   // push arguments
   1260   EmitCopyValue(RARG1, arg1);
   1261   EmitCopyValue(RARG2, arg2);
   1262   EmitCopyValue(RARG3, arg3);
   1263   EmitCopyValue(RARG4, arg4);
   1264 
   1265   // actually call the function
   1266   EmitCall(ptr);
   1267 
   1268   // restore caller-saved registers
   1269   RestoreStackAfterCall(adjust_size);
   1270 
   1271   // copy out return value if requested
   1272   if (return_value)
   1273   {
   1274     return_value->Undiscard();
   1275     EmitCopyValue(return_value->GetHostRegister(), Value::FromHostReg(&m_register_cache, RRETURN, return_value->size));
   1276   }
   1277 }
   1278 
   1279 void CodeGenerator::EmitPushHostReg(HostReg reg, u32 position)
   1280 {
   1281   const a32::MemOperand addr(a32::sp, FUNCTION_STACK_SIZE - (position * 4));
   1282   m_emit->str(GetHostReg32(reg), addr);
   1283 }
   1284 
   1285 void CodeGenerator::EmitPushHostRegPair(HostReg reg, HostReg reg2, u32 position)
   1286 {
   1287   // TODO: Use stm?
   1288   EmitPushHostReg(reg, position);
   1289   EmitPushHostReg(reg2, position + 1);
   1290 }
   1291 
   1292 void CodeGenerator::EmitPopHostReg(HostReg reg, u32 position)
   1293 {
   1294   const a32::MemOperand addr(a32::sp, FUNCTION_STACK_SIZE - (position * 4));
   1295   m_emit->ldr(GetHostReg32(reg), addr);
   1296 }
   1297 
   1298 void CodeGenerator::EmitPopHostRegPair(HostReg reg, HostReg reg2, u32 position)
   1299 {
   1300   // TODO: Use ldm?
   1301   Assert(position > 0);
   1302   EmitPopHostReg(reg2, position);
   1303   EmitPopHostReg(reg, position - 1);
   1304 }
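        // Spill slots are addressed downward from sp + FUNCTION_STACK_SIZE:
        // position 1 lives at [sp, #FUNCTION_STACK_SIZE - 4], position 2 at
        // [sp, #FUNCTION_STACK_SIZE - 8], and so on.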
   1305 
   1306 void CodeGenerator::EmitLoadCPUStructField(HostReg host_reg, RegSize guest_size, u32 offset)
   1307 {
   1308   const s32 s_offset = static_cast<s32>(offset);
   1309 
   1310   switch (guest_size)
   1311   {
   1312     case RegSize_8:
   1313       m_emit->ldrb(GetHostReg8(host_reg), a32::MemOperand(GetCPUPtrReg(), s_offset));
   1314       break;
   1315 
   1316     case RegSize_16:
   1317       m_emit->ldrh(GetHostReg16(host_reg), a32::MemOperand(GetCPUPtrReg(), s_offset));
   1318       break;
   1319 
   1320     case RegSize_32:
   1321       m_emit->ldr(GetHostReg32(host_reg), a32::MemOperand(GetCPUPtrReg(), s_offset));
   1322       break;
   1323 
   1324     default:
   1325     {
   1326       UnreachableCode();
   1327     }
   1328     break;
   1329   }
   1330 }
   1331 
   1332 void CodeGenerator::EmitStoreCPUStructField(u32 offset, const Value& value)
   1333 {
   1334   const Value hr_value = GetValueInHostOrScratchRegister(value);
   1335   const s32 s_offset = static_cast<s32>(offset);
   1336 
   1337   switch (value.size)
   1338   {
   1339     case RegSize_8:
   1340       m_emit->strb(GetHostReg8(hr_value), a32::MemOperand(GetCPUPtrReg(), s_offset));
   1341       break;
   1342 
   1343     case RegSize_16:
   1344       m_emit->strh(GetHostReg16(hr_value), a32::MemOperand(GetCPUPtrReg(), s_offset));
   1345       break;
   1346 
   1347     case RegSize_32:
   1348       m_emit->str(GetHostReg32(hr_value), a32::MemOperand(GetCPUPtrReg(), s_offset));
   1349       break;
   1350 
   1351     default:
   1352     {
   1353       UnreachableCode();
   1354     }
   1355     break;
   1356   }
   1357 }
   1358 
   1359 void CodeGenerator::EmitAddCPUStructField(u32 offset, const Value& value)
   1360 {
   1361   const s32 s_offset = static_cast<s32>(offset);
   1362   const a32::MemOperand o_offset(GetCPUPtrReg(), s_offset);
   1363 
   1364   Value real_value;
   1365   if (value.IsInHostRegister())
   1366   {
   1367     real_value.SetHostReg(&m_register_cache, value.host_reg, value.size);
   1368   }
   1369   else
   1370   {
   1371     // do we need temporary storage for the constant, if it won't fit in an immediate?
   1372     Assert(value.IsConstant());
   1373     const s32 constant_value = value.GetS32ConstantValue();
   1374     if (!a32::ImmediateA32::IsImmediateA32(static_cast<u32>(constant_value)))
   1375     {
   1376       real_value.SetHostReg(&m_register_cache, RARG2, value.size);
   1377       EmitCopyValue(real_value.host_reg, value);
   1378     }
   1379     else
   1380     {
   1381       real_value = value;
   1382     }
   1383   }
   1384 
   1385   // Don't need to mask here because we're storing back to memory.
   1386   switch (value.size)
   1387   {
   1388     case RegSize_8:
   1389     {
   1390       m_emit->Ldrb(GetHostReg8(RARG1), o_offset);
   1391       if (real_value.IsConstant())
   1392         m_emit->Add(GetHostReg8(RARG1), GetHostReg8(RARG1), real_value.GetS32ConstantValue());
   1393       else
   1394         m_emit->Add(GetHostReg8(RARG1), GetHostReg8(RARG1), GetHostReg8(real_value));
   1395       m_emit->Strb(GetHostReg8(RARG1), o_offset);
   1396     }
   1397     break;
   1398 
   1399     case RegSize_16:
   1400     {
   1401       m_emit->Ldrh(GetHostReg16(RARG1), o_offset);
   1402       if (real_value.IsConstant())
   1403         m_emit->Add(GetHostReg16(RARG1), GetHostReg16(RARG1), real_value.GetS32ConstantValue());
   1404       else
   1405         m_emit->Add(GetHostReg16(RARG1), GetHostReg16(RARG1), GetHostReg16(real_value));
   1406       m_emit->Strh(GetHostReg16(RARG1), o_offset);
   1407     }
   1408     break;
   1409 
   1410     case RegSize_32:
   1411     {
   1412       m_emit->Ldr(GetHostReg32(RARG1), o_offset);
   1413       if (real_value.IsConstant())
   1414         m_emit->Add(GetHostReg32(RARG1), GetHostReg32(RARG1), real_value.GetS32ConstantValue());
   1415       else
   1416         m_emit->Add(GetHostReg32(RARG1), GetHostReg32(RARG1), GetHostReg32(real_value));
   1417       m_emit->Str(GetHostReg32(RARG1), o_offset);
   1418     }
   1419     break;
   1420 
   1421     default:
   1422     {
   1423       UnreachableCode();
   1424     }
   1425     break;
   1426   }
   1427 }
   1428 
   1429 void CodeGenerator::EnsureMembaseLoaded()
   1430 {
   1431   if (m_membase_loaded)
   1432     return;
   1433 
   1434   m_emit->Ldr(GetFastmemBasePtrReg(), a32::MemOperand(GetCPUPtrReg(), OFFSETOF(State, fastmem_base)));
   1435   m_membase_loaded = true;
   1436 }
   1437 
   1438 void CodeGenerator::EmitUpdateFastmemBase()
   1439 {
   1440   m_membase_loaded = false;
   1441 }
   1442 
   1443 void CodeGenerator::EmitLoadGuestRAMFastmem(const Value& address, RegSize size, Value& result)
   1444 {
   1445   EnsureMembaseLoaded();
   1446 
   1447   HostReg address_reg;
   1448   if (address.IsConstant())
   1449   {
   1450     m_emit->Mov(GetHostReg32(RSCRATCH), static_cast<u32>(address.constant_value));
   1451     address_reg = RSCRATCH;
   1452   }
   1453   else
   1454   {
   1455     address_reg = address.host_reg;
   1456   }
   1457 
   1458   m_emit->lsr(GetHostReg32(RARG1), GetHostReg32(address_reg), Bus::FASTMEM_LUT_PAGE_SHIFT);
   1459   m_emit->ldr(GetHostReg32(RARG1),
   1460               a32::MemOperand(GetFastmemBasePtrReg(), GetHostReg32(RARG1), a32::LSL, 2)); // pointer load
   1461 
   1462   switch (size)
   1463   {
   1464     case RegSize_8:
   1465       m_emit->ldrb(GetHostReg32(result.host_reg), a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(address_reg)));
   1466       break;
   1467 
   1468     case RegSize_16:
   1469       m_emit->ldrh(GetHostReg32(result.host_reg), a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(address_reg)));
   1470       break;
   1471 
   1472     case RegSize_32:
   1473       m_emit->ldr(GetHostReg32(result.host_reg), a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(address_reg)));
   1474       break;
   1475 
   1476     default:
   1477       UnreachableCode();
   1478       break;
   1479   }
   1480 }
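        // Fastmem LUT addressing in effect here:
        //   ptr  = fastmem_base[address >> FASTMEM_LUT_PAGE_SHIFT]
        //   data = *(ptr + address)
        // The untruncated address is used as the offset, which implies each
        // LUT entry is pre-biased by its page's base address.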
   1481 
   1482 void CodeGenerator::EmitLoadGuestMemoryFastmem(Instruction instruction, const CodeCache::InstructionInfo& info,
   1483                                                const Value& address, RegSize size, Value& result)
   1484 {
   1485   EnsureMembaseLoaded();
   1486 
   1487   HostReg address_reg;
   1488   if (address.IsConstant())
   1489   {
   1490     m_emit->Mov(GetHostReg32(RSCRATCH), static_cast<u32>(address.constant_value));
   1491     address_reg = RSCRATCH;
   1492   }
   1493   else
   1494   {
   1495     address_reg = address.host_reg;
   1496   }
   1497 
   1498   m_emit->lsr(GetHostReg32(RARG1), GetHostReg32(address_reg), Bus::FASTMEM_LUT_PAGE_SHIFT);
   1499   m_emit->ldr(GetHostReg32(RARG1),
   1500               a32::MemOperand(GetFastmemBasePtrReg(), GetHostReg32(RARG1), a32::LSL, 2)); // pointer load
   1501 
   1502   m_register_cache.InhibitAllocation();
   1503 
   1504   void* host_pc = GetCurrentNearCodePointer();
   1505 
   1506   switch (size)
   1507   {
   1508     case RegSize_8:
   1509       m_emit->ldrb(GetHostReg32(result.host_reg), a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(address_reg)));
   1510       break;
   1511 
   1512     case RegSize_16:
   1513       m_emit->ldrh(GetHostReg32(result.host_reg), a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(address_reg)));
   1514       break;
   1515 
   1516     case RegSize_32:
   1517       m_emit->ldr(GetHostReg32(result.host_reg), a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(address_reg)));
   1518       break;
   1519 
   1520     default:
   1521       UnreachableCode();
   1522       break;
   1523   }
   1524 
   1525   const u32 host_code_size =
   1526     static_cast<u32>(static_cast<ptrdiff_t>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(host_pc)));
   1527 
   1528   // generate slowmem fallback
   1529   const void* host_slowmem_pc = GetCurrentFarCodePointer();
   1530   SwitchToFarCode();
   1531 
   1532   // the RAM read ticks are added to the delayed count *after* this add so the fast path bills them; the subtraction below cancels them again for this slow path
   1533   DebugAssert(m_delayed_cycles_add > 0);
   1534   EmitAddCPUStructField(OFFSETOF(State, pending_ticks), Value::FromConstantU32(static_cast<u32>(m_delayed_cycles_add)));
   1535   m_delayed_cycles_add += Bus::RAM_READ_TICKS;
   1536 
   1537   EmitLoadGuestMemorySlowmem(instruction, info, address, size, result, true);
   1538 
   1539   EmitAddCPUStructField(OFFSETOF(State, pending_ticks),
   1540                         Value::FromConstantU32(static_cast<u32>(-m_delayed_cycles_add)));
   1541 
   1542   // return to the block code
   1543   EmitBranch(GetCurrentNearCodePointer(), false);
   1544 
   1545   SwitchToNearCode();
   1546   m_register_cache.UninhibitAllocation();
   1547 
   1548   CPU::CodeCache::AddLoadStoreInfo(host_pc, host_code_size, info.pc, host_slowmem_pc);
   1549 }
   1550 
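// Slowmem load: call out to the memory thunks. With memory exceptions enabled
// the checked thunks are used and their result encodes success or failure,
// with the failure path raising the guest exception in far code; otherwise
// the unchecked thunks are called and the result is used as-is.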
   1551 void CodeGenerator::EmitLoadGuestMemorySlowmem(Instruction instruction, const CodeCache::InstructionInfo& info,
   1552                                                const Value& address, RegSize size, Value& result, bool in_far_code)
   1553 {
   1554   if (g_settings.cpu_recompiler_memory_exceptions)
   1555   {
   1556     // NOTE: This can leave junk in the upper bits
   1557     switch (size)
   1558     {
   1559       case RegSize_8:
   1560         EmitFunctionCall(&result, &Thunks::ReadMemoryByte, address);
   1561         break;
   1562 
   1563       case RegSize_16:
   1564         EmitFunctionCall(&result, &Thunks::ReadMemoryHalfWord, address);
   1565         break;
   1566 
   1567       case RegSize_32:
   1568         EmitFunctionCall(&result, &Thunks::ReadMemoryWord, address);
   1569         break;
   1570 
   1571       default:
   1572         UnreachableCode();
   1573         break;
   1574     }
   1575 
   1576     m_register_cache.PushState();
   1577 
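    // per the AAPCS the thunk's 64-bit return value arrives in r0/r1, so
    // GetHostReg32(1) below is the high word of the result; the thunks use a
    // bit in it to distinguish a successful load from a failed one, while the
    // low word carries the loaded value (or the exception info used below)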
   1578     a32::Label load_okay;
   1579     m_emit->tst(GetHostReg32(1), 1);
   1580     m_emit->b(a32::ne, &load_okay);
   1581     EmitBranch(GetCurrentFarCodePointer());
   1582     m_emit->Bind(&load_okay);
   1583 
   1584     // load exception path
   1585     if (!in_far_code)
   1586       SwitchToFarCode();
   1587 
   1588     // cause_bits = (-result << 2) | BD | cop_n
   1589     m_emit->rsb(GetHostReg32(result.host_reg), GetHostReg32(result.host_reg), 0);
   1590     m_emit->lsl(GetHostReg32(result.host_reg), GetHostReg32(result.host_reg), 2);
   1591     EmitOr(result.host_reg, result.host_reg,
   1592            Value::FromConstantU32(Cop0Registers::CAUSE::MakeValueForException(
   1593              static_cast<Exception>(0), info.is_branch_delay_slot, false, instruction.cop.cop_n)));
   1594     EmitFunctionCall(nullptr, static_cast<void (*)(u32, u32)>(&CPU::RaiseException), result, GetCurrentInstructionPC());
   1595 
   1596     EmitExceptionExit();
   1597 
   1598     if (!in_far_code)
   1599       SwitchToNearCode();
   1600 
   1601     m_register_cache.PopState();
   1602   }
   1603   else
   1604   {
   1605     switch (size)
   1606     {
   1607       case RegSize_8:
   1608         EmitFunctionCall(&result, &Thunks::UncheckedReadMemoryByte, address);
   1609         break;
   1610 
   1611       case RegSize_16:
   1612         EmitFunctionCall(&result, &Thunks::UncheckedReadMemoryHalfWord, address);
   1613         break;
   1614 
   1615       case RegSize_32:
   1616         EmitFunctionCall(&result, &Thunks::UncheckedReadMemoryWord, address);
   1617         break;
   1618 
   1619       default:
   1620         UnreachableCode();
   1621         break;
   1622     }
   1623   }
   1624 }
   1625 
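// Fastmem store: mirrors the load path above, with the same LUT addressing,
// far-code slowmem fallback, and backpatch bookkeeping via AddLoadStoreInfo().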
   1626 void CodeGenerator::EmitStoreGuestMemoryFastmem(Instruction instruction, const CodeCache::InstructionInfo& info,
   1627                                                 const Value& address, RegSize size, const Value& value)
   1628 {
   1629   EnsureMembaseLoaded();
   1630 
   1631   Value actual_value = GetValueInHostRegister(value);
   1632 
   1633   HostReg address_reg;
   1634   if (address.IsConstant())
   1635   {
   1636     m_emit->Mov(GetHostReg32(RSCRATCH), static_cast<u32>(address.constant_value));
   1637     address_reg = RSCRATCH;
   1638   }
   1639   else
   1640   {
   1641     address_reg = address.host_reg;
   1642   }
   1643 
   1644   // TODO: if this gets backpatched, these instructions are wasted
   1645 
   1646   m_emit->lsr(GetHostReg32(RARG1), GetHostReg32(address_reg), Bus::FASTMEM_LUT_PAGE_SHIFT);
   1647   m_emit->ldr(GetHostReg32(RARG1),
   1648               a32::MemOperand(GetFastmemBasePtrReg(), GetHostReg32(RARG1), a32::LSL, 2)); // pointer load
   1649 
   1650   m_register_cache.InhibitAllocation();
   1651 
   1652   void* host_pc = GetCurrentNearCodePointer();
   1653 
   1654   switch (size)
   1655   {
   1656     case RegSize_8:
   1657       m_emit->strb(GetHostReg32(actual_value.host_reg),
   1658                    a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(address_reg)));
   1659       break;
   1660 
   1661     case RegSize_16:
   1662       m_emit->strh(GetHostReg32(actual_value.host_reg),
   1663                    a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(address_reg)));
   1664       break;
   1665 
   1666     case RegSize_32:
   1667       m_emit->str(GetHostReg32(actual_value.host_reg), a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(address_reg)));
   1668       break;
   1669 
   1670     default:
   1671       UnreachableCode();
   1672       break;
   1673   }
   1674 
   1675   const u32 host_code_size =
   1676     static_cast<u32>(static_cast<ptrdiff_t>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(host_pc)));
   1677 
   1678   // generate slowmem fallback
   1679   void* host_slowmem_pc = GetCurrentFarCodePointer();
   1680   SwitchToFarCode();
   1681 
   1682   DebugAssert(m_delayed_cycles_add > 0);
   1683   EmitAddCPUStructField(OFFSETOF(State, pending_ticks), Value::FromConstantU32(static_cast<u32>(m_delayed_cycles_add)));
   1684 
   1685   EmitStoreGuestMemorySlowmem(instruction, info, address, size, actual_value, true);
   1686 
   1687   EmitAddCPUStructField(OFFSETOF(State, pending_ticks),
   1688                         Value::FromConstantU32(static_cast<u32>(-m_delayed_cycles_add)));
   1689 
   1690   // return to the block code
   1691   EmitBranch(GetCurrentNearCodePointer(), false);
   1692 
   1693   SwitchToNearCode();
   1694   m_register_cache.UninhibitAllocation();
   1695 
   1696   CPU::CodeCache::AddLoadStoreInfo(host_pc, host_code_size, info.pc, host_slowmem_pc);
   1697 }
   1698 
   1699 void CodeGenerator::EmitStoreGuestMemorySlowmem(Instruction instruction, const CodeCache::InstructionInfo& info,
   1700                                                 const Value& address, RegSize size, const Value& value,
   1701                                                 bool in_far_code)
   1702 {
   1703   Value value_in_hr = GetValueInHostRegister(value);
   1704 
   1705   if (g_settings.cpu_recompiler_memory_exceptions)
   1706   {
   1707     Assert(!in_far_code);
   1708 
   1709     Value result = m_register_cache.AllocateScratch(RegSize_32);
   1710     switch (size)
   1711     {
   1712       case RegSize_8:
   1713         EmitFunctionCall(&result, &Thunks::WriteMemoryByte, address, value_in_hr);
   1714         break;
   1715 
   1716       case RegSize_16:
   1717         EmitFunctionCall(&result, &Thunks::WriteMemoryHalfWord, address, value_in_hr);
   1718         break;
   1719 
   1720       case RegSize_32:
   1721         EmitFunctionCall(&result, &Thunks::WriteMemoryWord, address, value_in_hr);
   1722         break;
   1723 
   1724       default:
   1725         UnreachableCode();
   1726         break;
   1727     }
   1728 
   1729     m_register_cache.PushState();
   1730 
   1731     a32::Label store_okay;
   1732     m_emit->tst(GetHostReg32(result.host_reg), 1);
   1733     m_emit->b(a32::eq, &store_okay);
   1734     EmitBranch(GetCurrentFarCodePointer());
   1735     m_emit->Bind(&store_okay);
   1736 
   1737     // store exception path
   1738     if (!in_far_code)
   1739       SwitchToFarCode();
   1740 
   1741     // cause_bits = (result << 2) | BD | cop_n
   1742     m_emit->lsl(GetHostReg32(result.host_reg), GetHostReg32(result.host_reg), 2);
   1743     EmitOr(result.host_reg, result.host_reg,
   1744            Value::FromConstantU32(Cop0Registers::CAUSE::MakeValueForException(
   1745              static_cast<Exception>(0), info.is_branch_delay_slot, false, instruction.cop.cop_n)));
   1746     EmitFunctionCall(nullptr, static_cast<void (*)(u32, u32)>(&CPU::RaiseException), result, GetCurrentInstructionPC());
   1747 
    EmitExceptionExit();
    if (!in_far_code)
      SwitchToNearCode();
   1751 
   1752     m_register_cache.PopState();
   1753   }
   1754   else
   1755   {
   1756     switch (size)
   1757     {
   1758       case RegSize_8:
   1759         EmitFunctionCall(nullptr, &Thunks::UncheckedWriteMemoryByte, address, value_in_hr);
   1760         break;
   1761 
   1762       case RegSize_16:
   1763         EmitFunctionCall(nullptr, &Thunks::UncheckedWriteMemoryHalfWord, address, value_in_hr);
   1764         break;
   1765 
   1766       case RegSize_32:
   1767         EmitFunctionCall(nullptr, &Thunks::UncheckedWriteMemoryWord, address, value_in_hr);
   1768         break;
   1769 
   1770       default:
   1771         UnreachableCode();
   1772         break;
   1773     }
   1774   }
   1775 }
   1776 
   1777 void CodeGenerator::BackpatchLoadStore(void* host_pc, const CodeCache::LoadstoreBackpatchInfo& lbi)
   1778 {
   1779   DEV_LOG("Backpatching {} (guest PC 0x{:08X}) to slowmem at {}", host_pc, lbi.guest_pc, lbi.thunk_address);
   1780 
   1781   // turn it into a jump to the slowmem handler
   1782   vixl::aarch32::MacroAssembler emit(static_cast<vixl::byte*>(host_pc), lbi.code_size, a32::A32);
   1783 
   1784   // check jump distance
   1785   const s32 displacement = armGetPCDisplacement(host_pc, lbi.thunk_address);
   1786   if (!armIsPCDisplacementInImmediateRange(displacement))
   1787   {
   1788     armMoveAddressToReg(&emit, GetHostReg32(RSCRATCH), lbi.thunk_address);
   1789     emit.bx(GetHostReg32(RSCRATCH));
   1790   }
   1791   else
   1792   {
   1793     a32::Label label(displacement + emit.GetCursorOffset());
   1794     emit.b(&label);
   1795   }
   1796 
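  // pad the rest of the original fastmem sequence with NOPs so the patched
  // region keeps exactly its recorded code size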
   1797   const s32 nops = (static_cast<s32>(lbi.code_size) - static_cast<s32>(emit.GetCursorOffset())) / 4;
   1798   Assert(nops >= 0);
   1799   for (s32 i = 0; i < nops; i++)
   1800     emit.nop();
   1801 
   1802   MemMap::FlushInstructionCache(host_pc, lbi.code_size);
   1803 }
   1804 
   1805 void CodeGenerator::EmitLoadGlobal(HostReg host_reg, RegSize size, const void* ptr)
   1806 {
   1807   EmitLoadGlobalAddress(RSCRATCH, ptr);
   1808   switch (size)
   1809   {
   1810     case RegSize_8:
   1811       m_emit->Ldrb(GetHostReg8(host_reg), a32::MemOperand(GetHostReg32(RSCRATCH)));
   1812       break;
   1813 
   1814     case RegSize_16:
   1815       m_emit->Ldrh(GetHostReg16(host_reg), a32::MemOperand(GetHostReg32(RSCRATCH)));
   1816       break;
   1817 
   1818     case RegSize_32:
   1819       m_emit->Ldr(GetHostReg32(host_reg), a32::MemOperand(GetHostReg32(RSCRATCH)));
   1820       break;
   1821 
   1822     default:
   1823       UnreachableCode();
   1824       break;
   1825   }
   1826 }
   1827 
   1828 void CodeGenerator::EmitStoreGlobal(void* ptr, const Value& value)
   1829 {
   1830   Value value_in_hr = GetValueInHostRegister(value);
   1831 
   1832   EmitLoadGlobalAddress(RSCRATCH, ptr);
   1833   switch (value.size)
   1834   {
   1835     case RegSize_8:
   1836       m_emit->Strb(GetHostReg8(value_in_hr), a32::MemOperand(GetHostReg32(RSCRATCH)));
   1837       break;
   1838 
   1839     case RegSize_16:
   1840       m_emit->Strh(GetHostReg16(value_in_hr), a32::MemOperand(GetHostReg32(RSCRATCH)));
   1841       break;
   1842 
   1843     case RegSize_32:
   1844       m_emit->Str(GetHostReg32(value_in_hr), a32::MemOperand(GetHostReg32(RSCRATCH)));
   1845       break;
   1846 
   1847     default:
   1848       UnreachableCode();
   1849       break;
   1850   }
   1851 }
   1852 
   1853 void CodeGenerator::EmitFlushInterpreterLoadDelay()
   1854 {
   1855   Value reg = Value::FromHostReg(&m_register_cache, 0, RegSize_32);
   1856   Value value = Value::FromHostReg(&m_register_cache, 1, RegSize_32);
   1857 
   1858   const a32::MemOperand load_delay_reg(GetCPUPtrReg(), OFFSETOF(State, load_delay_reg));
   1859   const a32::MemOperand load_delay_value(GetCPUPtrReg(), OFFSETOF(State, load_delay_value));
   1860   const a32::MemOperand regs_base(GetCPUPtrReg(), OFFSETOF(State, regs.r[0]));
   1861 
   1862   a32::Label skip_flush;
   1863 
   1864   // reg = load_delay_reg
   1865   m_emit->Ldrb(GetHostReg32(reg), load_delay_reg);
   1866 
   1867   // if load_delay_reg == Reg::count goto skip_flush
   1868   m_emit->Cmp(GetHostReg32(reg), static_cast<u8>(Reg::count));
   1869   m_emit->B(a32::eq, &skip_flush);
   1870 
   1871   // value = load_delay_value
   1872   m_emit->Ldr(GetHostReg32(value), load_delay_value);
   1873 
  // reg = offsetof(State, regs.r[0]) + (reg << 2)
   1875   m_emit->Lsl(GetHostReg32(reg), GetHostReg32(reg), 2);
   1876   m_emit->Add(GetHostReg32(reg), GetHostReg32(reg), OFFSETOF(State, regs.r[0]));
   1877 
   1878   // r[reg] = value
   1879   m_emit->Str(GetHostReg32(value), a32::MemOperand(GetCPUPtrReg(), GetHostReg32(reg)));
   1880 
   1881   // load_delay_reg = Reg::count
   1882   m_emit->Mov(GetHostReg32(reg), static_cast<u8>(Reg::count));
   1883   m_emit->Strb(GetHostReg32(reg), load_delay_reg);
   1884 
   1885   m_emit->Bind(&skip_flush);
   1886 }
   1887 
   1888 void CodeGenerator::EmitMoveNextInterpreterLoadDelay()
   1889 {
   1890   Value reg = Value::FromHostReg(&m_register_cache, 0, RegSize_32);
   1891   Value value = Value::FromHostReg(&m_register_cache, 1, RegSize_32);
   1892 
   1893   const a32::MemOperand load_delay_reg(GetCPUPtrReg(), OFFSETOF(State, load_delay_reg));
   1894   const a32::MemOperand load_delay_value(GetCPUPtrReg(), OFFSETOF(State, load_delay_value));
   1895   const a32::MemOperand next_load_delay_reg(GetCPUPtrReg(), OFFSETOF(State, next_load_delay_reg));
   1896   const a32::MemOperand next_load_delay_value(GetCPUPtrReg(), OFFSETOF(State, next_load_delay_value));
   1897 
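  // promote next_load_delay_{reg,value} to load_delay_{reg,value}, then clear
  // the next slot by writing Reg::count (the "no pending load" sentinel)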
   1898   m_emit->ldrb(GetHostReg32(reg), next_load_delay_reg);
   1899   m_emit->ldr(GetHostReg32(value), next_load_delay_value);
   1900   m_emit->strb(GetHostReg32(reg), load_delay_reg);
   1901   m_emit->str(GetHostReg32(value), load_delay_value);
   1902   m_emit->Mov(GetHostReg32(reg), static_cast<u8>(Reg::count));
   1903   m_emit->strb(GetHostReg32(reg), next_load_delay_reg);
   1904 }
   1905 
   1906 void CodeGenerator::EmitCancelInterpreterLoadDelayForReg(Reg reg)
   1907 {
   1908   if (!m_load_delay_dirty)
   1909     return;
   1910 
   1911   const a32::MemOperand load_delay_reg(GetCPUPtrReg(), OFFSETOF(State, load_delay_reg));
   1912   Value temp = Value::FromHostReg(&m_register_cache, RSCRATCH, RegSize_8);
   1913 
   1914   a32::Label skip_cancel;
   1915 
   1916   // if load_delay_reg != reg goto skip_cancel
   1917   m_emit->ldrb(GetHostReg8(temp), load_delay_reg);
   1918   m_emit->cmp(GetHostReg8(temp), static_cast<u8>(reg));
   1919   m_emit->B(a32::ne, &skip_cancel);
   1920 
   1921   // load_delay_reg = Reg::count
   1922   m_emit->Mov(GetHostReg8(temp), static_cast<u8>(Reg::count));
   1923   m_emit->strb(GetHostReg8(temp), load_delay_reg);
   1924 
   1925   m_emit->Bind(&skip_cancel);
   1926 }
   1927 
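// Charges fetch time for the block: either a flat uncached-fetch cost, a
// dynamically looked-up per-word cost, or a simulated I-cache tag check that
// only charges fill ticks for lines whose tags miss.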
   1928 void CodeGenerator::EmitICacheCheckAndUpdate()
   1929 {
   1930   if (!m_block->HasFlag(CodeCache::BlockFlags::IsUsingICache))
   1931   {
   1932     if (m_block->HasFlag(CodeCache::BlockFlags::NeedsDynamicFetchTicks))
   1933     {
   1934       armEmitFarLoad(m_emit, GetHostReg32(RARG2), GetFetchMemoryAccessTimePtr());
   1935       m_emit->ldr(GetHostReg32(RARG1), a32::MemOperand(GetCPUPtrReg(), OFFSETOF(State, pending_ticks)));
   1936       m_emit->Mov(GetHostReg32(RARG3), m_block->size);
   1937       m_emit->mul(GetHostReg32(RARG2), GetHostReg32(RARG2), GetHostReg32(RARG3));
   1938       m_emit->add(GetHostReg32(RARG1), GetHostReg32(RARG1), GetHostReg32(RARG2));
   1939       m_emit->str(GetHostReg32(RARG1), a32::MemOperand(GetCPUPtrReg(), OFFSETOF(State, pending_ticks)));
   1940     }
   1941     else
   1942     {
   1943       EmitAddCPUStructField(OFFSETOF(State, pending_ticks),
   1944                             Value::FromConstantU32(static_cast<u32>(m_block->uncached_fetch_ticks)));
   1945     }
   1946   }
   1947   else if (m_block->icache_line_count > 0)
   1948   {
   1949     const auto& ticks_reg = a32::r0;
   1950     const auto& current_tag_reg = a32::r1;
   1951     const auto& existing_tag_reg = a32::r2;
   1952 
   1953     VirtualMemoryAddress current_pc = m_pc & ICACHE_TAG_ADDRESS_MASK;
   1954     m_emit->ldr(ticks_reg, a32::MemOperand(GetCPUPtrReg(), OFFSETOF(State, pending_ticks)));
   1955     m_emit->Mov(current_tag_reg, current_pc);
   1956 
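    // for each cache line the block spans: compare the stored tag against the
    // current one, and on a miss update the tag and charge the fill ticks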
   1957     for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE)
   1958     {
   1959       const TickCount fill_ticks = GetICacheFillTicks(current_pc);
   1960       if (fill_ticks <= 0)
   1961         continue;
   1962 
   1963       const u32 line = GetICacheLine(current_pc);
   1964       const u32 offset = OFFSETOF(State, icache_tags) + (line * sizeof(u32));
   1965 
   1966       a32::Label cache_hit;
   1967       m_emit->ldr(existing_tag_reg, a32::MemOperand(GetCPUPtrReg(), offset));
   1968       m_emit->cmp(existing_tag_reg, current_tag_reg);
   1969       m_emit->B(a32::eq, &cache_hit);
   1970 
   1971       m_emit->str(current_tag_reg, a32::MemOperand(GetCPUPtrReg(), offset));
   1972       EmitAdd(0, 0, Value::FromConstantU32(static_cast<u32>(fill_ticks)), false);
   1973       m_emit->Bind(&cache_hit);
   1974 
   1975       if (i != (m_block->icache_line_count - 1))
   1976         m_emit->add(current_tag_reg, current_tag_reg, ICACHE_LINE_SIZE);
   1977     }
   1978 
   1979     m_emit->str(ticks_reg, a32::MemOperand(GetCPUPtrReg(), OFFSETOF(State, pending_ticks)));
   1980   }
   1981 }
   1982 
   1983 void CodeGenerator::EmitBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size)
   1984 {
  // load both base pointers into registers up front: subsequent accesses can
  // then use immediate offsets from them, which keeps the compare loop small
   1986   armMoveAddressToReg(m_emit, GetHostReg32(RARG1), ram_ptr);
   1987   armMoveAddressToReg(m_emit, GetHostReg32(RARG2), shadow_ptr);
   1988 
   1989   u32 offset = 0;
   1990   a32::Label block_changed;
   1991 
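  // NOTE: the disabled sketch below appears to have been copied from the
  // aarch64 generator (V4S/cmeq/uminv and RXARG1 are aarch64 vixl constructs,
  // not aarch32 ones), so it is kept purely as a vectorization reference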
   1992 #if 0
   1993   /* TODO: Vectorize
   1994 #include <arm_neon.h>
   1995 #include <stdint.h>
   1996 
   1997 bool foo(const void* a, const void* b)
   1998 {
   1999     uint8x16_t v1 = vld1q_u8((const uint8_t*)a);
   2000     uint8x16_t v2 = vld1q_u8((const uint8_t*)b);
    uint8x16_t v3 = vld1q_u8((const uint8_t*)a + 16);
    uint8x16_t v4 = vld1q_u8((const uint8_t*)b + 16); // was loading a+16 twice
    uint8x16_t r = vceqq_u8(v1, v2);
    uint8x16_t r2 = vceqq_u8(v3, v4); // compare the second pair, not v2/v3
    uint8x16_t r3 = vandq_u8(r, r2);
   2006     uint32x2_t rr = vpmin_u32(vget_low_u32(vreinterpretq_u32_u8(r3)), vget_high_u32(vreinterpretq_u32_u8(r3)));
   2007     if ((vget_lane_u32(rr, 0) & vget_lane_u32(rr, 1)) != 0xFFFFFFFFu)
   2008         return false;
   2009     else
   2010         return true;
   2011 }
   2012 */
   2013   bool first = true;
   2014 
   2015   while (size >= 16)
   2016   {
   2017     const a32::VRegister vtmp = a32::v2.V4S();
   2018     const a32::VRegister dst = first ? a32::v0.V4S() : a32::v1.V4S();
   2019     m_emit->ldr(dst, a32::MemOperand(RXARG1, offset));
   2020     m_emit->ldr(vtmp, a32::MemOperand(RXARG2, offset));
   2021     m_emit->cmeq(dst, dst, vtmp);
   2022     if (!first)
   2023       m_emit->and_(dst.V16B(), dst.V16B(), vtmp.V16B());
   2024     else
   2025       first = false;
   2026 
   2027     offset += 16;
   2028     size -= 16;
   2029   }
   2030 
   2031   if (!first)
   2032   {
   2033     // TODO: make sure this doesn't choke on ffffffff
   2034     m_emit->uminv(a32::s0, a32::v0.V4S());
   2035     m_emit->fcmp(a32::s0, 0.0);
   2036     m_emit->b(&block_changed, a32::eq);
   2037   }
   2038 #endif
   2039 
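  // scalar path: compare the block's RAM against its shadow copy one word at
  // a time; any mismatch means the code was overwritten, so bail out to the
  // discard-and-recompile handler below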
   2040   while (size >= 4)
   2041   {
   2042     m_emit->ldr(GetHostReg32(RARG3), a32::MemOperand(GetHostReg32(RARG1), offset));
   2043     m_emit->ldr(GetHostReg32(RARG4), a32::MemOperand(GetHostReg32(RARG2), offset));
   2044     m_emit->cmp(GetHostReg32(RARG3), GetHostReg32(RARG4));
   2045     m_emit->b(a32::ne, &block_changed);
   2046     offset += 4;
   2047     size -= 4;
   2048   }
   2049 
   2050   DebugAssert(size == 0);
   2051 
   2052   a32::Label block_unchanged;
   2053   m_emit->b(&block_unchanged);
   2054   m_emit->bind(&block_changed);
   2055   armEmitJmp(m_emit, CodeCache::g_discard_and_recompile_block, false);
   2056   m_emit->bind(&block_unchanged);
   2057 }
   2058 
   2059 void CodeGenerator::EmitStallUntilGTEComplete()
   2060 {
   2061   static_assert(OFFSETOF(State, pending_ticks) + sizeof(u32) == OFFSETOF(State, gte_completion_tick));
   2062 
   2063   m_emit->ldr(GetHostReg32(RARG1), a32::MemOperand(GetCPUPtrReg(), OFFSETOF(State, pending_ticks)));
   2064   m_emit->ldr(GetHostReg32(RARG2), a32::MemOperand(GetCPUPtrReg(), OFFSETOF(State, gte_completion_tick)));
   2065 
   2066   if (m_delayed_cycles_add > 0)
   2067   {
   2068     m_emit->Add(GetHostReg32(RARG1), GetHostReg32(RARG1), static_cast<u32>(m_delayed_cycles_add));
   2069     m_delayed_cycles_add = 0;
   2070   }
   2071 
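  // pending_ticks = max(pending_ticks, gte_completion_tick): the conditional
  // mov executes only when RARG2 compares unsigned-higher than RARG1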
   2072   m_emit->cmp(GetHostReg32(RARG2), GetHostReg32(RARG1));
   2073   m_emit->mov(a32::hi, GetHostReg32(RARG1), GetHostReg32(RARG2));
   2074   m_emit->str(GetHostReg32(RARG1), a32::MemOperand(GetCPUPtrReg(), OFFSETOF(State, pending_ticks)));
   2075 }
   2076 
   2077 void CodeGenerator::EmitBranch(const void* address, bool allow_scratch)
   2078 {
   2079   const s32 displacement = armGetPCDisplacement(GetCurrentCodePointer(), address);
   2080   if (armIsPCDisplacementInImmediateRange(displacement))
   2081   {
   2082     a32::Label label(displacement + m_emit->GetCursorOffset());
   2083     m_emit->b(&label);
   2084     return;
   2085   }
   2086 
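  // out of immediate range: branch absolutely through the scratch register.
  // note this ignores allow_scratch; presumably callers passing false rely on
  // the near/far displacement always fitting the branch immediate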
   2087   m_emit->Mov(GetHostReg32(RSCRATCH), reinterpret_cast<uintptr_t>(address));
   2088   m_emit->bx(GetHostReg32(RSCRATCH));
   2089 }
   2090 
   2091 void CodeGenerator::EmitBranch(LabelType* label)
   2092 {
   2093   m_emit->b(label);
   2094 }
   2095 
   2096 static a32::Condition TranslateCondition(Condition condition, bool invert)
   2097 {
   2098   switch (condition)
   2099   {
   2100     case Condition::Always:
   2101       return a32::Condition::None();
   2102 
   2103     case Condition::NotEqual:
   2104     case Condition::NotZero:
   2105       return invert ? a32::eq : a32::ne;
   2106 
   2107     case Condition::Equal:
   2108     case Condition::Zero:
   2109       return invert ? a32::ne : a32::eq;
   2110 
   2111     case Condition::Overflow:
   2112       return invert ? a32::vc : a32::vs;
   2113 
   2114     case Condition::Greater:
   2115       return invert ? a32::le : a32::gt;
   2116 
   2117     case Condition::GreaterEqual:
   2118       return invert ? a32::lt : a32::ge;
   2119 
   2120     case Condition::Less:
   2121       return invert ? a32::ge : a32::lt;
   2122 
   2123     case Condition::LessEqual:
   2124       return invert ? a32::gt : a32::le;
   2125 
   2126     case Condition::Negative:
   2127       return invert ? a32::pl : a32::mi;
   2128 
   2129     case Condition::PositiveOrZero:
   2130       return invert ? a32::mi : a32::pl;
   2131 
   2132     case Condition::Above:
   2133       return invert ? a32::ls : a32::hi;
   2134 
   2135     case Condition::AboveEqual:
   2136       return invert ? a32::cc : a32::cs;
   2137 
   2138     case Condition::Below:
   2139       return invert ? a32::cs : a32::cc;
   2140 
   2141     case Condition::BelowEqual:
   2142       return invert ? a32::hi : a32::ls;
   2143 
   2144     default:
   2145       UnreachableCode();
   2146       return a32::Condition::Never();
   2147   }
   2148 }
   2149 
   2150 void CodeGenerator::EmitConditionalBranch(Condition condition, bool invert, HostReg value, RegSize size,
   2151                                           LabelType* label)
   2152 {
   2153   switch (condition)
   2154   {
   2155     case Condition::NotEqual:
   2156     case Condition::Equal:
   2157     case Condition::Overflow:
   2158     case Condition::Greater:
   2159     case Condition::GreaterEqual:
   2160     case Condition::LessEqual:
   2161     case Condition::Less:
   2162     case Condition::Above:
   2163     case Condition::AboveEqual:
   2164     case Condition::Below:
   2165     case Condition::BelowEqual:
   2166       Panic("Needs a comparison value");
   2167       return;
   2168 
    case Condition::Negative:
    case Condition::PositiveOrZero:
    case Condition::NotZero:
    case Condition::Zero:
    {
      switch (size)
      {
        case RegSize_8:
          m_emit->tst(GetHostReg8(value), GetHostReg8(value));
          break;
        case RegSize_16:
          m_emit->tst(GetHostReg16(value), GetHostReg16(value));
          break;
        case RegSize_32:
          m_emit->tst(GetHostReg32(value), GetHostReg32(value));
          break;
        default:
          UnreachableCode();
          break;
      }

      // delegating to the label overload routes everything through
      // TranslateCondition(), so the invert flag is honoured for NotZero/Zero
      // as well instead of hard-coding ne/eq, and the value register is tested
      // at its declared width rather than always as an 8-bit register
      EmitConditionalBranch(condition, invert, label);
      return;
    }
   2239 
   2240     case Condition::Always:
   2241       m_emit->b(label);
   2242       return;
   2243 
   2244     default:
   2245       UnreachableCode();
   2246       return;
   2247   }
   2248 }
   2249 
   2250 void CodeGenerator::EmitConditionalBranch(Condition condition, bool invert, HostReg lhs, const Value& rhs,
   2251                                           LabelType* label)
   2252 {
   2253   switch (condition)
   2254   {
   2255     case Condition::NotEqual:
   2256     case Condition::Equal:
   2257     case Condition::Overflow:
   2258     case Condition::Greater:
   2259     case Condition::GreaterEqual:
   2260     case Condition::LessEqual:
   2261     case Condition::Less:
   2262     case Condition::Above:
   2263     case Condition::AboveEqual:
   2264     case Condition::Below:
   2265     case Condition::BelowEqual:
   2266     {
   2267       EmitCmp(lhs, rhs);
   2268       EmitConditionalBranch(condition, invert, label);
   2269       return;
   2270     }
   2271 
   2272     case Condition::Negative:
   2273     case Condition::PositiveOrZero:
   2274     case Condition::NotZero:
   2275     case Condition::Zero:
   2276     {
   2277       Assert(!rhs.IsValid() || (rhs.IsConstant() && rhs.GetS64ConstantValue() == 0));
   2278       EmitConditionalBranch(condition, invert, lhs, rhs.size, label);
   2279       return;
   2280     }
   2281 
   2282     case Condition::Always:
   2283       m_emit->b(label);
   2284       return;
   2285 
   2286     default:
   2287       UnreachableCode();
   2288       return;
   2289   }
   2290 }
   2291 
   2292 void CodeGenerator::EmitConditionalBranch(Condition condition, bool invert, LabelType* label)
   2293 {
   2294   if (condition == Condition::Always)
   2295     m_emit->b(label);
   2296   else
   2297     m_emit->b(TranslateCondition(condition, invert), label);
   2298 }
   2299 
   2300 void CodeGenerator::EmitBranchIfBitClear(HostReg reg, RegSize size, u8 bit, LabelType* label)
   2301 {
   2302   switch (size)
   2303   {
   2304     case RegSize_8:
   2305     case RegSize_16:
   2306     case RegSize_32:
   2307       m_emit->tst(GetHostReg32(reg), static_cast<s32>(1u << bit));
   2308       m_emit->b(a32::eq, label);
   2309       break;
   2310 
   2311     default:
   2312       UnreachableCode();
   2313       break;
   2314   }
   2315 }
   2316 
   2317 void CodeGenerator::EmitBindLabel(LabelType* label)
   2318 {
   2319   m_emit->Bind(label);
   2320 }
   2321 
   2322 void CodeGenerator::EmitLoadGlobalAddress(HostReg host_reg, const void* ptr)
   2323 {
   2324   m_emit->Mov(GetHostReg32(host_reg), reinterpret_cast<uintptr_t>(ptr));
   2325 }
   2326 
   2327 } // namespace CPU::Recompiler
   2328 
   2329 #endif // CPU_ARCH_ARM32