cpu_newrec_compiler_x64.cpp
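// x86-64 backend of the NewRec recompiler: translates PS1 MIPS instructions into host code with Xbyak.
// CPU state is addressed relative to RSTATE (rbp) through the PTR() macro below, and guest RAM is
// reached through RMEMBASE (rbx) when fastmem is enabled.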
1 // SPDX-FileCopyrightText: 2023 Connor McLaughlin <stenzek@gmail.com> 2 // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) 3 4 #include "cpu_newrec_compiler_x64.h" 5 #include "common/align.h" 6 #include "common/assert.h" 7 #include "common/log.h" 8 #include "common/string_util.h" 9 #include "cpu_code_cache_private.h" 10 #include "cpu_core_private.h" 11 #include "cpu_pgxp.h" 12 #include "cpu_recompiler_thunks.h" 13 #include "cpu_recompiler_types.h" 14 #include "gte.h" 15 #include "settings.h" 16 #include "timing_event.h" 17 #include <limits> 18 19 #ifdef CPU_ARCH_X64 20 21 Log_SetChannel(CPU::NewRec); 22 23 #define RMEMBASE cg->rbx 24 #define RSTATE cg->rbp 25 26 // #define PTR(x) (cg->rip + (x)) 27 #define PTR(x) (RSTATE + (u32)(((u8*)(x)) - ((u8*)&g_state))) 28 29 // PGXP TODO: LWL etc, MFC0 30 // PGXP TODO: Spyro 1 level gates have issues. 31 32 static constexpr u32 BACKPATCH_JMP_SIZE = 5; 33 34 // on win32, we need to reserve an additional 32 bytes shadow space when calling out to C 35 #ifdef _WIN32 36 static constexpr u32 STACK_SHADOW_SIZE = 32; 37 #else 38 static constexpr u32 STACK_SHADOW_SIZE = 0; 39 #endif 40 41 using namespace Xbyak; 42 43 using CPU::Recompiler::IsCallerSavedRegister; 44 45 // TODO: try using a pointer to state instead of rip-relative.. it might end up faster due to smaller code 46 47 namespace CPU::NewRec { 48 X64Compiler s_instance; 49 Compiler* g_compiler = &s_instance; 50 } // namespace CPU::NewRec 51 52 CPU::NewRec::X64Compiler::X64Compiler() = default; 53 54 CPU::NewRec::X64Compiler::~X64Compiler() = default; 55 56 void CPU::NewRec::X64Compiler::Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space, 57 u8* far_code_buffer, u32 far_code_space) 58 { 59 Compiler::Reset(block, code_buffer, code_buffer_space, far_code_buffer, far_code_space); 60 61 // TODO: don't recreate this every time.. 62 DebugAssert(!m_emitter && !m_far_emitter && !cg); 63 m_emitter = std::make_unique<Xbyak::CodeGenerator>(code_buffer_space, code_buffer); 64 m_far_emitter = std::make_unique<Xbyak::CodeGenerator>(far_code_space, far_code_buffer); 65 cg = m_emitter.get(); 66 67 // Need to wipe it out so it's correct when toggling fastmem. 68 m_host_regs = {}; 69 70 const u32 membase_idx = CodeCache::IsUsingFastmem() ? static_cast<u32>(RMEMBASE.getIdx()) : NUM_HOST_REGS; 71 const u32 cpu_idx = static_cast<u32>(RSTATE.getIdx()); 72 for (u32 i = 0; i < NUM_HOST_REGS; i++) 73 { 74 HostRegAlloc& ra = m_host_regs[i]; 75 76 if (i == static_cast<u32>(RWRET.getIdx()) || i == static_cast<u32>(RWARG1.getIdx()) || 77 i == static_cast<u32>(RWARG2.getIdx()) || i == static_cast<u32>(RWARG3.getIdx()) || 78 i == static_cast<u32>(cg->rsp.getIdx()) || i == cpu_idx || i == membase_idx || 79 i == static_cast<u32>(cg->ecx.getIdx()) /* keep ecx free for shifts, maybe use BMI? */) 80 { 81 continue; 82 } 83 84 ra.flags = HR_USABLE | (IsCallerSavedRegister(i) ? 0 : HR_CALLEE_SAVED); 85 } 86 } 87 88 void CPU::NewRec::X64Compiler::SwitchToFarCode(bool emit_jump, void (Xbyak::CodeGenerator::*jump_op)(const void*)) 89 { 90 DebugAssert(cg == m_emitter.get()); 91 if (emit_jump) 92 { 93 const void* fcptr = m_far_emitter->getCurr<const void*>(); 94 (jump_op) ? 
(cg->*jump_op)(fcptr) : cg->jmp(fcptr); 95 } 96 cg = m_far_emitter.get(); 97 } 98 99 void CPU::NewRec::X64Compiler::SwitchToNearCode(bool emit_jump, void (Xbyak::CodeGenerator::*jump_op)(const void*)) 100 { 101 DebugAssert(cg == m_far_emitter.get()); 102 if (emit_jump) 103 { 104 const void* fcptr = m_emitter->getCurr<const void*>(); 105 (jump_op) ? (cg->*jump_op)(fcptr) : cg->jmp(fcptr); 106 } 107 cg = m_emitter.get(); 108 } 109 110 void CPU::NewRec::X64Compiler::BeginBlock() 111 { 112 Compiler::BeginBlock(); 113 114 #if 0 115 if (m_block->pc == 0xBFC06F0C) 116 { 117 //__debugbreak(); 118 cg->db(0xcc); 119 } 120 #endif 121 122 #if 0 123 cg->nop(); 124 cg->mov(RWARG1, m_block->pc); 125 cg->nop(); 126 #endif 127 } 128 129 void CPU::NewRec::X64Compiler::GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size) 130 { 131 // store it first to reduce code size, because we can offset 132 cg->mov(RXARG1, static_cast<size_t>(reinterpret_cast<uintptr_t>(ram_ptr))); 133 cg->mov(RXARG2, static_cast<size_t>(reinterpret_cast<uintptr_t>(shadow_ptr))); 134 135 bool first = true; 136 u32 offset = 0; 137 while (size >= 16) 138 { 139 const Xbyak::Xmm& dst = first ? cg->xmm0 : cg->xmm1; 140 cg->movups(dst, cg->xword[RXARG1 + offset]); 141 cg->pcmpeqd(dst, cg->xword[RXARG2 + offset]); 142 if (!first) 143 cg->pand(cg->xmm0, dst); 144 else 145 first = false; 146 147 offset += 16; 148 size -= 16; 149 } 150 151 // TODO: better codegen for 16 byte aligned blocks 152 if (!first) 153 { 154 cg->movmskps(cg->eax, cg->xmm0); 155 cg->cmp(cg->eax, 0xf); 156 cg->jne(CodeCache::g_discard_and_recompile_block); 157 } 158 159 while (size >= 8) 160 { 161 cg->mov(RXARG3, cg->qword[RXARG1 + offset]); 162 cg->cmp(RXARG3, cg->qword[RXARG2 + offset]); 163 cg->jne(CodeCache::g_discard_and_recompile_block); 164 offset += 8; 165 size -= 8; 166 } 167 168 while (size >= 4) 169 { 170 cg->mov(RWARG3, cg->dword[RXARG1 + offset]); 171 cg->cmp(RWARG3, cg->dword[RXARG2 + offset]); 172 cg->jne(CodeCache::g_discard_and_recompile_block); 173 offset += 4; 174 size -= 4; 175 } 176 177 DebugAssert(size == 0); 178 } 179 180 void CPU::NewRec::X64Compiler::GenerateICacheCheckAndUpdate() 181 { 182 if (!m_block->HasFlag(CodeCache::BlockFlags::IsUsingICache)) 183 { 184 if (m_block->HasFlag(CodeCache::BlockFlags::NeedsDynamicFetchTicks)) 185 { 186 cg->mov(cg->eax, m_block->size); 187 cg->mul(cg->dword[cg->rip + GetFetchMemoryAccessTimePtr()]); 188 cg->add(cg->dword[PTR(&g_state.pending_ticks)], cg->eax); 189 } 190 else 191 { 192 cg->add(cg->dword[PTR(&g_state.pending_ticks)], static_cast<u32>(m_block->uncached_fetch_ticks)); 193 } 194 } 195 else if (m_block->icache_line_count > 0) 196 { 197 cg->lea(RXARG1, cg->dword[PTR(&g_state.icache_tags)]); 198 199 // TODO: Vectorize this... 
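// Rough C sketch of the per-line check emitted by the loop below (descriptive comment only, not
// emitted code); lines whose fill penalty is zero or negative are skipped at compile time:
//   if (g_state.icache_tags[line] != tag) {
//     g_state.icache_tags[line] = tag;      // refill the line
//     g_state.pending_ticks += fill_ticks;  // charge the fetch penalty
//   }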
200 VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK; 201 for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE) 202 { 203 const VirtualMemoryAddress tag = GetICacheTagForAddress(current_pc); 204 const TickCount fill_ticks = GetICacheFillTicks(current_pc); 205 if (fill_ticks <= 0) 206 continue; 207 208 const u32 line = GetICacheLine(current_pc); 209 const u32 offset = (line * sizeof(u32)); 210 Xbyak::Label cache_hit; 211 212 cg->cmp(cg->dword[RXARG1 + offset], tag); 213 cg->je(cache_hit); 214 cg->mov(cg->dword[RXARG1 + offset], tag); 215 cg->add(cg->dword[PTR(&g_state.pending_ticks)], static_cast<u32>(fill_ticks)); 216 cg->L(cache_hit); 217 } 218 } 219 } 220 221 void CPU::NewRec::X64Compiler::GenerateCall(const void* func, s32 arg1reg /*= -1*/, s32 arg2reg /*= -1*/, 222 s32 arg3reg /*= -1*/) 223 { 224 if (arg1reg >= 0 && arg1reg != static_cast<s32>(RXARG1.getIdx())) 225 cg->mov(RXARG1, Reg64(arg1reg)); 226 if (arg2reg >= 0 && arg2reg != static_cast<s32>(RXARG2.getIdx())) 227 cg->mov(RXARG2, Reg64(arg2reg)); 228 if (arg3reg >= 0 && arg3reg != static_cast<s32>(RXARG3.getIdx())) 229 cg->mov(RXARG3, Reg64(arg3reg)); 230 cg->call(func); 231 } 232 233 void CPU::NewRec::X64Compiler::EndBlock(const std::optional<u32>& newpc, bool do_event_test) 234 { 235 if (newpc.has_value()) 236 { 237 if (m_dirty_pc || m_compiler_pc != newpc) 238 cg->mov(cg->dword[PTR(&g_state.pc)], newpc.value()); 239 } 240 m_dirty_pc = false; 241 242 // flush regs 243 Flush(FLUSH_END_BLOCK); 244 EndAndLinkBlock(newpc, do_event_test, false); 245 } 246 247 void CPU::NewRec::X64Compiler::EndBlockWithException(Exception excode) 248 { 249 // flush regs, but not pc, it's going to get overwritten 250 // flush cycles because of the GTE instruction stuff... 251 Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL); 252 253 // TODO: flush load delay 254 // TODO: break for pcdrv 255 256 cg->mov(RWARG1, Cop0Registers::CAUSE::MakeValueForException(excode, m_current_instruction_branch_delay_slot, false, 257 inst->cop.cop_n)); 258 cg->mov(RWARG2, m_current_instruction_pc); 259 cg->call(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)); 260 m_dirty_pc = false; 261 262 EndAndLinkBlock(std::nullopt, true, false); 263 } 264 265 void CPU::NewRec::X64Compiler::EndAndLinkBlock(const std::optional<u32>& newpc, bool do_event_test, 266 bool force_run_events) 267 { 268 // event test 269 // pc should've been flushed 270 DebugAssert(!m_dirty_pc && !m_block_ended); 271 m_block_ended = true; 272 273 // TODO: try extracting this to a function 274 275 // save cycles for event test 276 const TickCount cycles = std::exchange(m_cycles, 0); 277 278 // fast path when not doing an event test 279 if (!do_event_test && m_gte_done_cycle <= cycles) 280 { 281 if (cycles == 1) 282 cg->inc(cg->dword[PTR(&g_state.pending_ticks)]); 283 else if (cycles > 0) 284 cg->add(cg->dword[PTR(&g_state.pending_ticks)], cycles); 285 286 if (force_run_events) 287 { 288 cg->jmp(CodeCache::g_run_events_and_dispatch); 289 return; 290 } 291 } 292 else 293 { 294 // pending_ticks += cycles 295 // if (pending_ticks >= downcount) { dispatch_event(); } 296 if (do_event_test || cycles > 0 || m_gte_done_cycle > cycles) 297 cg->mov(RWARG1, cg->dword[PTR(&g_state.pending_ticks)]); 298 if (cycles > 0) 299 cg->add(RWARG1, cycles); 300 if (m_gte_done_cycle > cycles) 301 { 302 cg->mov(RWARG2, RWARG1); 303 ((m_gte_done_cycle - cycles) == 1) ? 
cg->inc(RWARG2) : cg->add(RWARG2, m_gte_done_cycle - cycles); 304 cg->mov(cg->dword[PTR(&g_state.gte_completion_tick)], RWARG2); 305 } 306 if (do_event_test) 307 cg->cmp(RWARG1, cg->dword[PTR(&g_state.downcount)]); 308 if (cycles > 0) 309 cg->mov(cg->dword[PTR(&g_state.pending_ticks)], RWARG1); 310 if (do_event_test) 311 cg->jge(CodeCache::g_run_events_and_dispatch); 312 } 313 314 // jump to dispatcher or next block 315 if (!newpc.has_value()) 316 { 317 cg->jmp(CodeCache::g_dispatcher); 318 } 319 else 320 { 321 if (newpc.value() == m_block->pc) 322 { 323 // Special case: ourselves! No need to backlink then. 324 DEBUG_LOG("Linking block at {:08X} to self", m_block->pc); 325 cg->jmp(cg->getCode()); 326 } 327 else 328 { 329 const void* target = CodeCache::CreateBlockLink(m_block, cg->getCurr<void*>(), newpc.value()); 330 cg->jmp(target, CodeGenerator::T_NEAR); 331 } 332 } 333 } 334 335 const void* CPU::NewRec::X64Compiler::EndCompile(u32* code_size, u32* far_code_size) 336 { 337 const void* code = m_emitter->getCode(); 338 *code_size = static_cast<u32>(m_emitter->getSize()); 339 *far_code_size = static_cast<u32>(m_far_emitter->getSize()); 340 cg = nullptr; 341 m_far_emitter.reset(); 342 m_emitter.reset(); 343 return code; 344 } 345 346 const void* CPU::NewRec::X64Compiler::GetCurrentCodePointer() 347 { 348 return cg->getCurr(); 349 } 350 351 const char* CPU::NewRec::X64Compiler::GetHostRegName(u32 reg) const 352 { 353 static constexpr std::array<const char*, 16> reg64_names = { 354 {"rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"}}; 355 return (reg < reg64_names.size()) ? reg64_names[reg] : "UNKNOWN"; 356 } 357 358 void CPU::NewRec::X64Compiler::LoadHostRegWithConstant(u32 reg, u32 val) 359 { 360 cg->mov(Reg32(reg), val); 361 } 362 363 void CPU::NewRec::X64Compiler::LoadHostRegFromCPUPointer(u32 reg, const void* ptr) 364 { 365 cg->mov(Reg32(reg), cg->dword[PTR(ptr)]); 366 } 367 368 void CPU::NewRec::X64Compiler::StoreHostRegToCPUPointer(u32 reg, const void* ptr) 369 { 370 cg->mov(cg->dword[PTR(ptr)], Reg32(reg)); 371 } 372 373 void CPU::NewRec::X64Compiler::StoreConstantToCPUPointer(u32 val, const void* ptr) 374 { 375 cg->mov(cg->dword[PTR(ptr)], val); 376 } 377 378 void CPU::NewRec::X64Compiler::CopyHostReg(u32 dst, u32 src) 379 { 380 if (src != dst) 381 cg->mov(Reg32(dst), Reg32(src)); 382 } 383 384 Xbyak::Address CPU::NewRec::X64Compiler::MipsPtr(Reg r) const 385 { 386 DebugAssert(r < Reg::count); 387 return cg->dword[PTR(&g_state.regs.r[static_cast<u32>(r)])]; 388 } 389 390 Xbyak::Reg32 CPU::NewRec::X64Compiler::CFGetRegD(CompileFlags cf) const 391 { 392 DebugAssert(cf.valid_host_d); 393 return Reg32(cf.host_d); 394 } 395 396 Xbyak::Reg32 CPU::NewRec::X64Compiler::CFGetRegS(CompileFlags cf) const 397 { 398 DebugAssert(cf.valid_host_s); 399 return Reg32(cf.host_s); 400 } 401 402 Xbyak::Reg32 CPU::NewRec::X64Compiler::CFGetRegT(CompileFlags cf) const 403 { 404 DebugAssert(cf.valid_host_t); 405 return Reg32(cf.host_t); 406 } 407 408 Xbyak::Reg32 CPU::NewRec::X64Compiler::CFGetRegLO(CompileFlags cf) const 409 { 410 DebugAssert(cf.valid_host_lo); 411 return Reg32(cf.host_lo); 412 } 413 414 Xbyak::Reg32 CPU::NewRec::X64Compiler::CFGetRegHI(CompileFlags cf) const 415 { 416 DebugAssert(cf.valid_host_hi); 417 return Reg32(cf.host_hi); 418 } 419 420 Xbyak::Reg32 CPU::NewRec::X64Compiler::MoveSToD(CompileFlags cf) 421 { 422 DebugAssert(cf.valid_host_d); 423 DebugAssert(!cf.valid_host_t || cf.host_t != cf.host_d); 424 425 const Reg32 rd = 
CFGetRegD(cf); 426 MoveSToReg(rd, cf); 427 428 return rd; 429 } 430 431 Xbyak::Reg32 CPU::NewRec::X64Compiler::MoveSToT(CompileFlags cf) 432 { 433 DebugAssert(cf.valid_host_t); 434 435 const Reg32 rt = CFGetRegT(cf); 436 if (cf.valid_host_s) 437 { 438 const Reg32 rs = CFGetRegS(cf); 439 if (rt != rs) 440 cg->mov(rt, rs); 441 } 442 else if (cf.const_s) 443 { 444 if (const u32 cv = GetConstantRegU32(cf.MipsS()); cv != 0) 445 cg->mov(rt, cv); 446 else 447 cg->xor_(rt, rt); 448 } 449 else 450 { 451 cg->mov(rt, MipsPtr(cf.MipsS())); 452 } 453 454 return rt; 455 } 456 457 Xbyak::Reg32 CPU::NewRec::X64Compiler::MoveTToD(CompileFlags cf) 458 { 459 DebugAssert(cf.valid_host_d); 460 DebugAssert(!cf.valid_host_s || cf.host_s != cf.host_d); 461 462 const Reg32 rd = CFGetRegD(cf); 463 MoveTToReg(rd, cf); 464 return rd; 465 } 466 467 void CPU::NewRec::X64Compiler::MoveSToReg(const Xbyak::Reg32& dst, CompileFlags cf) 468 { 469 if (cf.valid_host_s) 470 { 471 if (cf.host_s != static_cast<u32>(dst.getIdx())) 472 cg->mov(dst, Reg32(cf.host_s)); 473 } 474 else if (cf.const_s) 475 { 476 const u32 cv = GetConstantRegU32(cf.MipsS()); 477 if (cv == 0) 478 cg->xor_(dst, dst); 479 else 480 cg->mov(dst, cv); 481 } 482 else 483 { 484 cg->mov(dst, cg->dword[PTR(&g_state.regs.r[cf.mips_s])]); 485 } 486 } 487 488 void CPU::NewRec::X64Compiler::MoveTToReg(const Xbyak::Reg32& dst, CompileFlags cf) 489 { 490 if (cf.valid_host_t) 491 { 492 if (cf.host_t != static_cast<u32>(dst.getIdx())) 493 cg->mov(dst, Reg32(cf.host_t)); 494 } 495 else if (cf.const_t) 496 { 497 const u32 cv = GetConstantRegU32(cf.MipsT()); 498 if (cv == 0) 499 cg->xor_(dst, dst); 500 else 501 cg->mov(dst, cv); 502 } 503 else 504 { 505 cg->mov(dst, cg->dword[PTR(&g_state.regs.r[cf.mips_t])]); 506 } 507 } 508 509 void CPU::NewRec::X64Compiler::MoveMIPSRegToReg(const Xbyak::Reg32& dst, Reg reg) 510 { 511 DebugAssert(reg < Reg::count); 512 if (const std::optional<u32> hreg = CheckHostReg(0, Compiler::HR_TYPE_CPU_REG, reg)) 513 cg->mov(dst, Reg32(hreg.value())); 514 else if (HasConstantReg(reg)) 515 cg->mov(dst, GetConstantRegU32(reg)); 516 else 517 cg->mov(dst, MipsPtr(reg)); 518 } 519 520 void CPU::NewRec::X64Compiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, 521 Reg arg2reg /* = Reg::count */, 522 Reg arg3reg /* = Reg::count */) 523 { 524 DebugAssert(g_settings.gpu_pgxp_enable); 525 526 Flush(FLUSH_FOR_C_CALL); 527 528 if (arg2reg != Reg::count) 529 MoveMIPSRegToReg(RWARG2, arg2reg); 530 if (arg3reg != Reg::count) 531 MoveMIPSRegToReg(RWARG3, arg3reg); 532 533 cg->mov(RWARG1, arg1val); 534 cg->call(func); 535 } 536 537 void CPU::NewRec::X64Compiler::Flush(u32 flags) 538 { 539 Compiler::Flush(flags); 540 541 if (flags & FLUSH_PC && m_dirty_pc) 542 { 543 cg->mov(cg->dword[PTR(&g_state.pc)], m_compiler_pc); 544 m_dirty_pc = false; 545 } 546 547 if (flags & FLUSH_INSTRUCTION_BITS) 548 { 549 cg->mov(cg->dword[PTR(&g_state.current_instruction.bits)], inst->bits); 550 cg->mov(cg->dword[PTR(&g_state.current_instruction_pc)], m_current_instruction_pc); 551 cg->mov(cg->byte[PTR(&g_state.current_instruction_in_branch_delay_slot)], m_current_instruction_branch_delay_slot); 552 } 553 554 if (flags & FLUSH_LOAD_DELAY_FROM_STATE && m_load_delay_dirty) 555 { 556 // This sucks :( 557 // TODO: make it a function? 
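// Equivalent C sketch of the sequence emitted below (descriptive comment only):
//   g_state.regs.r[g_state.load_delay_reg] = g_state.load_delay_value;
//   g_state.load_delay_reg = static_cast<u8>(Reg::count);  // no load pending anymore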
558 cg->movzx(RWARG1, cg->byte[PTR(&g_state.load_delay_reg)]); 559 cg->mov(RWARG2, cg->dword[PTR(&g_state.load_delay_value)]); 560 cg->mov(cg->dword[PTR(&g_state.regs.r[0]) + RXARG1 * 4], RWARG2); 561 cg->mov(cg->byte[PTR(&g_state.load_delay_reg)], static_cast<u8>(Reg::count)); 562 m_load_delay_dirty = false; 563 } 564 565 if (flags & FLUSH_LOAD_DELAY && m_load_delay_register != Reg::count) 566 { 567 if (m_load_delay_value_register != NUM_HOST_REGS) 568 FreeHostReg(m_load_delay_value_register); 569 570 cg->mov(cg->byte[PTR(&g_state.load_delay_reg)], static_cast<u8>(m_load_delay_register)); 571 m_load_delay_register = Reg::count; 572 m_load_delay_dirty = true; 573 } 574 575 if (flags & FLUSH_GTE_STALL_FROM_STATE && m_dirty_gte_done_cycle) 576 { 577 // May as well flush cycles while we're here. 578 // GTE spanning blocks is very rare, we _could_ disable this for speed. 579 cg->mov(RWARG1, cg->dword[PTR(&g_state.pending_ticks)]); 580 cg->mov(RWARG2, cg->dword[PTR(&g_state.gte_completion_tick)]); 581 if (m_cycles > 0) 582 { 583 (m_cycles == 1) ? cg->inc(RWARG1) : cg->add(RWARG1, m_cycles); 584 m_cycles = 0; 585 } 586 cg->cmp(RWARG2, RWARG1); 587 cg->cmova(RWARG1, RWARG2); 588 cg->mov(cg->dword[PTR(&g_state.pending_ticks)], RWARG1); 589 m_dirty_gte_done_cycle = false; 590 } 591 592 if (flags & FLUSH_GTE_DONE_CYCLE && m_gte_done_cycle > m_cycles) 593 { 594 cg->mov(RWARG1, cg->dword[PTR(&g_state.pending_ticks)]); 595 596 // update cycles at the same time 597 if (flags & FLUSH_CYCLES && m_cycles > 0) 598 { 599 (m_cycles == 1) ? cg->inc(RWARG1) : cg->add(RWARG1, m_cycles); 600 cg->mov(cg->dword[PTR(&g_state.pending_ticks)], RWARG1); 601 m_gte_done_cycle -= m_cycles; 602 m_cycles = 0; 603 } 604 605 (m_gte_done_cycle == 1) ? cg->inc(RWARG1) : cg->add(RWARG1, m_gte_done_cycle); 606 cg->mov(cg->dword[PTR(&g_state.gte_completion_tick)], RWARG1); 607 m_gte_done_cycle = 0; 608 m_dirty_gte_done_cycle = true; 609 } 610 611 if (flags & FLUSH_CYCLES && m_cycles > 0) 612 { 613 (m_cycles == 1) ? cg->inc(cg->dword[PTR(&g_state.pending_ticks)]) : 614 cg->add(cg->dword[PTR(&g_state.pending_ticks)], m_cycles); 615 m_gte_done_cycle = std::max<TickCount>(m_gte_done_cycle - m_cycles, 0); 616 m_cycles = 0; 617 } 618 } 619 620 void CPU::NewRec::X64Compiler::Compile_Fallback() 621 { 622 WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", iinfo->pc, inst->bits); 623 624 Flush(FLUSH_FOR_INTERPRETER); 625 626 cg->call(&CPU::Recompiler::Thunks::InterpretInstruction); 627 628 // TODO: make me less garbage 629 // TODO: this is wrong, it flushes the load delay on the same cycle when we return. 630 // but nothing should be going through here.. 
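// Equivalent C sketch of the sequence emitted below, which promotes next_load_delay into
// load_delay after the interpreted instruction (descriptive comment only):
//   if (g_state.next_load_delay_reg != Reg::count) {
//     g_state.load_delay_reg   = g_state.next_load_delay_reg;
//     g_state.load_delay_value = g_state.next_load_delay_value;
//     g_state.next_load_delay_reg = Reg::count;
//   }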
631 Label no_load_delay; 632 cg->movzx(RWARG1, cg->byte[PTR(&g_state.next_load_delay_reg)]); 633 cg->cmp(RWARG1, static_cast<u8>(Reg::count)); 634 cg->je(no_load_delay, CodeGenerator::T_SHORT); 635 cg->mov(RWARG2, cg->dword[PTR(&g_state.next_load_delay_value)]); 636 cg->mov(cg->byte[PTR(&g_state.load_delay_reg)], RWARG1); 637 cg->mov(cg->dword[PTR(&g_state.load_delay_value)], RWARG2); 638 cg->mov(cg->byte[PTR(&g_state.next_load_delay_reg)], static_cast<u32>(Reg::count)); 639 cg->L(no_load_delay); 640 641 m_load_delay_dirty = EMULATE_LOAD_DELAYS; 642 } 643 644 void CPU::NewRec::X64Compiler::CheckBranchTarget(const Xbyak::Reg32& pcreg) 645 { 646 if (!g_settings.cpu_recompiler_memory_exceptions) 647 return; 648 649 cg->test(pcreg, 0x3); 650 SwitchToFarCode(true, &CodeGenerator::jnz); 651 652 BackupHostState(); 653 EndBlockWithException(Exception::AdEL); 654 655 RestoreHostState(); 656 SwitchToNearCode(false); 657 } 658 659 void CPU::NewRec::X64Compiler::Compile_jr(CompileFlags cf) 660 { 661 if (!cf.valid_host_s) 662 cg->mov(RWARG1, MipsPtr(cf.MipsS())); 663 664 const Reg32 pcreg = cf.valid_host_s ? CFGetRegS(cf) : RWARG1; 665 CheckBranchTarget(pcreg); 666 667 cg->mov(cg->dword[PTR(&g_state.pc)], pcreg); 668 669 CompileBranchDelaySlot(false); 670 EndBlock(std::nullopt, true); 671 } 672 673 void CPU::NewRec::X64Compiler::Compile_jalr(CompileFlags cf) 674 { 675 if (!cf.valid_host_s) 676 cg->mov(RWARG1, MipsPtr(cf.MipsS())); 677 678 const Reg32 pcreg = cf.valid_host_s ? CFGetRegS(cf) : RWARG1; 679 680 if (MipsD() != Reg::zero) 681 SetConstantReg(MipsD(), GetBranchReturnAddress(cf)); 682 683 CheckBranchTarget(pcreg); 684 cg->mov(cg->dword[PTR(&g_state.pc)], pcreg); 685 686 CompileBranchDelaySlot(false); 687 EndBlock(std::nullopt, true); 688 } 689 690 void CPU::NewRec::X64Compiler::Compile_bxx(CompileFlags cf, BranchCondition cond) 691 { 692 const u32 taken_pc = GetConditionalBranchTarget(cf); 693 694 Flush(FLUSH_FOR_BRANCH); 695 696 DebugAssert(cf.valid_host_s); 697 698 // MipsT() here should equal zero for zero branches. 699 DebugAssert(cond == BranchCondition::Equal || cond == BranchCondition::NotEqual || cf.MipsT() == Reg::zero); 700 701 // TODO: Swap this back to near once instructions don't blow up 702 constexpr CodeGenerator::LabelType type = CodeGenerator::T_NEAR; 703 Label taken; 704 switch (cond) 705 { 706 case BranchCondition::Equal: 707 case BranchCondition::NotEqual: 708 { 709 // we should always have S, maybe not T 710 // TODO: if it's zero, we can just do test rs, rs 711 if (cf.valid_host_t) 712 cg->cmp(CFGetRegS(cf), CFGetRegT(cf)); 713 else if (cf.const_t) 714 cg->cmp(CFGetRegS(cf), GetConstantRegU32(cf.MipsT())); 715 else 716 cg->cmp(CFGetRegS(cf), MipsPtr(cf.MipsT())); 717 718 (cond == BranchCondition::Equal) ? 
cg->je(taken, type) : cg->jne(taken, type); 719 } 720 break; 721 722 case BranchCondition::GreaterThanZero: 723 { 724 cg->cmp(CFGetRegS(cf), 0); 725 cg->jg(taken, type); 726 } 727 break; 728 729 case BranchCondition::GreaterEqualZero: 730 { 731 cg->test(CFGetRegS(cf), CFGetRegS(cf)); 732 cg->jns(taken, type); 733 } 734 break; 735 736 case BranchCondition::LessThanZero: 737 { 738 cg->test(CFGetRegS(cf), CFGetRegS(cf)); 739 cg->js(taken, type); 740 } 741 break; 742 743 case BranchCondition::LessEqualZero: 744 { 745 cg->cmp(CFGetRegS(cf), 0); 746 cg->jle(taken, type); 747 } 748 break; 749 } 750 751 BackupHostState(); 752 if (!cf.delay_slot_swapped) 753 CompileBranchDelaySlot(); 754 755 EndBlock(m_compiler_pc, true); 756 757 cg->L(taken); 758 759 RestoreHostState(); 760 if (!cf.delay_slot_swapped) 761 CompileBranchDelaySlot(); 762 763 EndBlock(taken_pc, true); 764 } 765 766 void CPU::NewRec::X64Compiler::Compile_addi(CompileFlags cf) 767 { 768 const Reg32 rt = MoveSToT(cf); 769 if (const u32 imm = inst->i.imm_sext32(); imm != 0) 770 { 771 cg->add(rt, imm); 772 if (g_settings.cpu_recompiler_memory_exceptions) 773 { 774 DebugAssert(cf.valid_host_t); 775 TestOverflow(rt); 776 } 777 } 778 } 779 780 void CPU::NewRec::X64Compiler::Compile_addiu(CompileFlags cf) 781 { 782 const Reg32 rt = MoveSToT(cf); 783 if (const u32 imm = inst->i.imm_sext32(); imm != 0) 784 cg->add(rt, imm); 785 } 786 787 void CPU::NewRec::X64Compiler::Compile_slti(CompileFlags cf) 788 { 789 Compile_slti(cf, true); 790 } 791 792 void CPU::NewRec::X64Compiler::Compile_sltiu(CompileFlags cf) 793 { 794 Compile_slti(cf, false); 795 } 796 797 void CPU::NewRec::X64Compiler::Compile_slti(CompileFlags cf, bool sign) 798 { 799 const Reg32 rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG1; 800 801 // Case where T == S, can't use xor because it changes flags 802 if (!cf.valid_host_t || !cf.valid_host_s || cf.host_t != cf.host_s) 803 cg->xor_(rt, rt); 804 805 if (cf.valid_host_s) 806 cg->cmp(CFGetRegS(cf), inst->i.imm_sext32()); 807 else 808 cg->cmp(MipsPtr(cf.MipsS()), inst->i.imm_sext32()); 809 810 if (cf.valid_host_t && cf.valid_host_s && cf.host_t == cf.host_s) 811 cg->mov(rt, 0); 812 813 sign ? 
cg->setl(rt.cvt8()) : cg->setb(rt.cvt8()); 814 815 if (!cf.valid_host_t) 816 cg->mov(MipsPtr(cf.MipsT()), rt); 817 } 818 819 void CPU::NewRec::X64Compiler::Compile_andi(CompileFlags cf) 820 { 821 if (const u32 imm = inst->i.imm_zext32(); imm != 0) 822 { 823 const Reg32 rt = MoveSToT(cf); 824 cg->and_(rt, imm); 825 } 826 else 827 { 828 const Reg32 rt = CFGetRegT(cf); 829 cg->xor_(rt, rt); 830 } 831 } 832 833 void CPU::NewRec::X64Compiler::Compile_ori(CompileFlags cf) 834 { 835 const Reg32 rt = MoveSToT(cf); 836 if (const u32 imm = inst->i.imm_zext32(); imm != 0) 837 cg->or_(rt, imm); 838 } 839 840 void CPU::NewRec::X64Compiler::Compile_xori(CompileFlags cf) 841 { 842 const Reg32 rt = MoveSToT(cf); 843 if (const u32 imm = inst->i.imm_zext32(); imm != 0) 844 cg->xor_(rt, imm); 845 } 846 847 void CPU::NewRec::X64Compiler::Compile_sll(CompileFlags cf) 848 { 849 const Reg32 rd = MoveTToD(cf); 850 if (inst->r.shamt > 0) 851 cg->shl(rd, inst->r.shamt); 852 } 853 854 void CPU::NewRec::X64Compiler::Compile_srl(CompileFlags cf) 855 { 856 const Reg32 rd = MoveTToD(cf); 857 if (inst->r.shamt > 0) 858 cg->shr(rd, inst->r.shamt); 859 } 860 861 void CPU::NewRec::X64Compiler::Compile_sra(CompileFlags cf) 862 { 863 const Reg32 rd = MoveTToD(cf); 864 if (inst->r.shamt > 0) 865 cg->sar(rd, inst->r.shamt); 866 } 867 868 void CPU::NewRec::X64Compiler::Compile_variable_shift( 869 CompileFlags cf, void (Xbyak::CodeGenerator::*op)(const Xbyak::Operand&, const Xbyak::Reg8&), 870 void (Xbyak::CodeGenerator::*op_const)(const Xbyak::Operand&, int)) 871 { 872 const Reg32 rd = CFGetRegD(cf); 873 if (!cf.const_s) 874 { 875 MoveSToReg(cg->ecx, cf); 876 MoveTToReg(rd, cf); 877 (cg->*op)(rd, cg->cl); 878 } 879 else 880 { 881 MoveTToReg(rd, cf); 882 (cg->*op_const)(rd, GetConstantRegU32(cf.MipsS())); 883 } 884 } 885 886 void CPU::NewRec::X64Compiler::Compile_sllv(CompileFlags cf) 887 { 888 Compile_variable_shift(cf, &CodeGenerator::shl, &CodeGenerator::shl); 889 } 890 891 void CPU::NewRec::X64Compiler::Compile_srlv(CompileFlags cf) 892 { 893 Compile_variable_shift(cf, &CodeGenerator::shr, &CodeGenerator::shr); 894 } 895 896 void CPU::NewRec::X64Compiler::Compile_srav(CompileFlags cf) 897 { 898 Compile_variable_shift(cf, &CodeGenerator::sar, &CodeGenerator::sar); 899 } 900 901 void CPU::NewRec::X64Compiler::Compile_mult(CompileFlags cf, bool sign) 902 { 903 // RAX/RDX shouldn't be allocatable.. 904 DebugAssert(!(m_host_regs[Xbyak::Operand::RAX].flags & HR_USABLE) && 905 !(m_host_regs[Xbyak::Operand::RDX].flags & HR_USABLE)); 906 907 MoveSToReg(cg->eax, cf); 908 if (cf.valid_host_t) 909 { 910 sign ? cg->imul(CFGetRegT(cf)) : cg->mul(CFGetRegT(cf)); 911 } 912 else if (cf.const_t) 913 { 914 cg->mov(cg->edx, GetConstantRegU32(cf.MipsT())); 915 sign ? cg->imul(cg->edx) : cg->mul(cg->edx); 916 } 917 else 918 { 919 sign ? cg->imul(MipsPtr(cf.MipsT())) : cg->mul(MipsPtr(cf.MipsT())); 920 } 921 922 // TODO: skip writeback if it's not needed 923 if (cf.valid_host_lo) 924 cg->mov(CFGetRegLO(cf), cg->eax); 925 else 926 cg->mov(MipsPtr(Reg::lo), cg->eax); 927 if (cf.valid_host_hi) 928 cg->mov(CFGetRegHI(cf), cg->edx); 929 else 930 cg->mov(MipsPtr(Reg::hi), cg->edx); 931 } 932 933 void CPU::NewRec::X64Compiler::Compile_mult(CompileFlags cf) 934 { 935 Compile_mult(cf, true); 936 } 937 938 void CPU::NewRec::X64Compiler::Compile_multu(CompileFlags cf) 939 { 940 Compile_mult(cf, false); 941 } 942 943 void CPU::NewRec::X64Compiler::Compile_div(CompileFlags cf) 944 { 945 // not supported without registers for now..
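// The code emitted below follows the R3000A DIV special cases; as a sketch (descriptive comment only):
//   if (rt == 0)                                     { lo = (rs >= 0) ? 0xFFFFFFFFu : 1u; hi = rs; }
//   else if (rs == 0x80000000u && rt == 0xFFFFFFFFu) { lo = 0x80000000u; hi = 0; }
//   else                                             { lo = rs / rt; hi = rs % rt; }  // signed division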
946 DebugAssert(cf.valid_host_lo && cf.valid_host_hi); 947 948 const Reg32 rt = cf.valid_host_t ? CFGetRegT(cf) : cg->ecx; 949 if (!cf.valid_host_t) 950 MoveTToReg(rt, cf); 951 952 const Reg32 rlo = CFGetRegLO(cf); 953 const Reg32 rhi = CFGetRegHI(cf); 954 955 MoveSToReg(cg->eax, cf); 956 cg->cdq(); 957 958 Label done; 959 Label not_divide_by_zero; 960 cg->test(rt, rt); 961 cg->jnz(not_divide_by_zero, CodeGenerator::T_SHORT); 962 cg->test(cg->eax, cg->eax); 963 cg->mov(rhi, cg->eax); // hi = num 964 cg->mov(rlo, 1); 965 cg->mov(cg->eax, static_cast<u32>(-1)); 966 cg->cmovns(rlo, cg->eax); // lo = s >= 0 ? -1 : 1 967 cg->jmp(done, CodeGenerator::T_SHORT); 968 969 cg->L(not_divide_by_zero); 970 Label not_unrepresentable; 971 cg->cmp(cg->eax, 0x80000000u); 972 cg->jne(not_unrepresentable, CodeGenerator::T_SHORT); 973 cg->cmp(rt, static_cast<u32>(-1)); 974 cg->jne(not_unrepresentable, CodeGenerator::T_SHORT); 975 976 cg->mov(rlo, 0x80000000u); 977 cg->xor_(rhi, rhi); 978 cg->jmp(done, CodeGenerator::T_SHORT); 979 980 cg->L(not_unrepresentable); 981 982 cg->idiv(rt); 983 cg->mov(rlo, cg->eax); 984 cg->mov(rhi, cg->edx); 985 986 cg->L(done); 987 } 988 989 void CPU::NewRec::X64Compiler::Compile_divu(CompileFlags cf) 990 { 991 // not supported without registers for now.. 992 DebugAssert(cf.valid_host_lo && cf.valid_host_hi); 993 994 const Reg32 rt = cf.valid_host_t ? CFGetRegT(cf) : cg->ecx; 995 if (!cf.valid_host_t) 996 MoveTToReg(rt, cf); 997 998 const Reg32 rlo = CFGetRegLO(cf); 999 const Reg32 rhi = CFGetRegHI(cf); 1000 1001 MoveSToReg(cg->eax, cf); 1002 cg->xor_(cg->edx, cg->edx); 1003 1004 Label done; 1005 Label not_divide_by_zero; 1006 cg->test(rt, rt); 1007 cg->jnz(not_divide_by_zero, CodeGenerator::T_SHORT); 1008 cg->mov(rlo, static_cast<u32>(-1)); 1009 cg->mov(rhi, cg->eax); 1010 cg->jmp(done, CodeGenerator::T_SHORT); 1011 1012 cg->L(not_divide_by_zero); 1013 cg->div(rt); 1014 cg->mov(rlo, cg->eax); 1015 cg->mov(rhi, cg->edx); 1016 1017 cg->L(done); 1018 } 1019 1020 void CPU::NewRec::X64Compiler::TestOverflow(const Xbyak::Reg32& result) 1021 { 1022 SwitchToFarCode(true, &Xbyak::CodeGenerator::jo); 1023 1024 BackupHostState(); 1025 1026 // toss the result 1027 ClearHostReg(result.getIdx()); 1028 1029 EndBlockWithException(Exception::Ov); 1030 1031 RestoreHostState(); 1032 1033 SwitchToNearCode(false); 1034 } 1035 1036 void CPU::NewRec::X64Compiler::Compile_dst_op( 1037 CompileFlags cf, void (Xbyak::CodeGenerator::*op)(const Xbyak::Operand&, const Xbyak::Operand&), 1038 void (Xbyak::CodeGenerator::*op_const)(const Xbyak::Operand&, u32), bool commutative, bool overflow) 1039 { 1040 if (cf.valid_host_s && cf.valid_host_t) 1041 { 1042 if (cf.host_d == cf.host_s) 1043 { 1044 (cg->*op)(CFGetRegD(cf), CFGetRegT(cf)); 1045 } 1046 else if (cf.host_d == cf.host_t) 1047 { 1048 if (commutative) 1049 { 1050 (cg->*op)(CFGetRegD(cf), CFGetRegS(cf)); 1051 } 1052 else 1053 { 1054 cg->mov(RWARG1, CFGetRegT(cf)); 1055 cg->mov(CFGetRegD(cf), CFGetRegS(cf)); 1056 (cg->*op)(CFGetRegD(cf), RWARG1); 1057 } 1058 } 1059 else 1060 { 1061 cg->mov(CFGetRegD(cf), CFGetRegS(cf)); 1062 (cg->*op)(CFGetRegD(cf), CFGetRegT(cf)); 1063 } 1064 } 1065 else if (commutative && (cf.const_s || cf.const_t)) 1066 { 1067 const Reg32 rd = CFGetRegD(cf); 1068 (cf.const_s) ? MoveTToReg(rd, cf) : MoveSToReg(rd, cf); 1069 if (const u32 cv = GetConstantRegU32(cf.const_s ? 
cf.MipsS() : cf.MipsT()); cv != 0) 1070 (cg->*op_const)(CFGetRegD(cf), cv); 1071 else 1072 overflow = false; 1073 } 1074 else if (cf.const_s) 1075 { 1076 // need to backup T? 1077 if (cf.valid_host_d && cf.valid_host_t && cf.host_d == cf.host_t) 1078 { 1079 cg->mov(RWARG1, CFGetRegT(cf)); 1080 MoveSToReg(CFGetRegD(cf), cf); 1081 (cg->*op)(CFGetRegD(cf), RWARG1); 1082 } 1083 else 1084 { 1085 MoveSToReg(CFGetRegD(cf), cf); 1086 (cg->*op)(CFGetRegD(cf), CFGetRegT(cf)); 1087 } 1088 } 1089 else if (cf.const_t) 1090 { 1091 MoveSToReg(CFGetRegD(cf), cf); 1092 if (const u32 cv = GetConstantRegU32(cf.MipsT()); cv != 0) 1093 (cg->*op_const)(CFGetRegD(cf), cv); 1094 else 1095 overflow = false; 1096 } 1097 else if (cf.valid_host_s) 1098 { 1099 if (cf.host_d != cf.host_s) 1100 cg->mov(CFGetRegD(cf), CFGetRegS(cf)); 1101 (cg->*op)(CFGetRegD(cf), MipsPtr(cf.MipsT())); 1102 } 1103 else if (cf.valid_host_t) 1104 { 1105 if (cf.host_d != cf.host_t) 1106 cg->mov(CFGetRegD(cf), CFGetRegT(cf)); 1107 (cg->*op)(CFGetRegD(cf), MipsPtr(cf.MipsS())); 1108 } 1109 else 1110 { 1111 cg->mov(CFGetRegD(cf), MipsPtr(cf.MipsS())); 1112 (cg->*op)(CFGetRegD(cf), MipsPtr(cf.MipsT())); 1113 } 1114 1115 if (overflow) 1116 { 1117 DebugAssert(cf.valid_host_d); 1118 TestOverflow(CFGetRegD(cf)); 1119 } 1120 } 1121 1122 void CPU::NewRec::X64Compiler::Compile_add(CompileFlags cf) 1123 { 1124 Compile_dst_op(cf, &CodeGenerator::add, &CodeGenerator::add, true, g_settings.cpu_recompiler_memory_exceptions); 1125 } 1126 1127 void CPU::NewRec::X64Compiler::Compile_addu(CompileFlags cf) 1128 { 1129 Compile_dst_op(cf, &CodeGenerator::add, &CodeGenerator::add, true, false); 1130 } 1131 1132 void CPU::NewRec::X64Compiler::Compile_sub(CompileFlags cf) 1133 { 1134 Compile_dst_op(cf, &CodeGenerator::sub, &CodeGenerator::sub, false, g_settings.cpu_recompiler_memory_exceptions); 1135 } 1136 1137 void CPU::NewRec::X64Compiler::Compile_subu(CompileFlags cf) 1138 { 1139 Compile_dst_op(cf, &CodeGenerator::sub, &CodeGenerator::sub, false, false); 1140 } 1141 1142 void CPU::NewRec::X64Compiler::Compile_and(CompileFlags cf) 1143 { 1144 // special cases - and with self -> self, and with 0 -> 0 1145 const Reg32 regd = CFGetRegD(cf); 1146 if (cf.MipsS() == cf.MipsT()) 1147 { 1148 MoveSToReg(regd, cf); 1149 return; 1150 } 1151 else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0)) 1152 { 1153 cg->xor_(regd, regd); 1154 return; 1155 } 1156 1157 Compile_dst_op(cf, &CodeGenerator::and_, &CodeGenerator::and_, true, false); 1158 } 1159 1160 void CPU::NewRec::X64Compiler::Compile_or(CompileFlags cf) 1161 { 1162 // or/nor with 0 -> no effect 1163 const Reg32 regd = CFGetRegD(cf); 1164 if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0) || cf.MipsS() == cf.MipsT()) 1165 { 1166 cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf); 1167 return; 1168 } 1169 1170 Compile_dst_op(cf, &CodeGenerator::or_, &CodeGenerator::or_, true, false); 1171 } 1172 1173 void CPU::NewRec::X64Compiler::Compile_xor(CompileFlags cf) 1174 { 1175 const Reg32 regd = CFGetRegD(cf); 1176 if (cf.MipsS() == cf.MipsT()) 1177 { 1178 // xor with self -> zero 1179 cg->xor_(regd, regd); 1180 return; 1181 } 1182 else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0)) 1183 { 1184 // xor with zero -> no effect 1185 cf.const_s ? 
MoveTToReg(regd, cf) : MoveSToReg(regd, cf); 1186 return; 1187 } 1188 1189 Compile_dst_op(cf, &CodeGenerator::xor_, &CodeGenerator::xor_, true, false); 1190 } 1191 1192 void CPU::NewRec::X64Compiler::Compile_nor(CompileFlags cf) 1193 { 1194 Compile_or(cf); 1195 cg->not_(CFGetRegD(cf)); 1196 } 1197 1198 void CPU::NewRec::X64Compiler::Compile_slt(CompileFlags cf) 1199 { 1200 Compile_slt(cf, true); 1201 } 1202 1203 void CPU::NewRec::X64Compiler::Compile_sltu(CompileFlags cf) 1204 { 1205 Compile_slt(cf, false); 1206 } 1207 1208 void CPU::NewRec::X64Compiler::Compile_slt(CompileFlags cf, bool sign) 1209 { 1210 const Reg32 rd = CFGetRegD(cf); 1211 const Reg32 rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1; 1212 const Reg32 rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG1; 1213 if (!cf.valid_host_s) 1214 MoveSToReg(rs, cf); 1215 1216 // Case where D == S, can't use xor because it changes flags 1217 // TODO: swap and reverse op for constants 1218 if (rd != rs && rd != rt) 1219 cg->xor_(rd, rd); 1220 1221 if (cf.valid_host_t) 1222 cg->cmp(rs, CFGetRegT(cf)); 1223 else if (cf.const_t) 1224 cg->cmp(rs, GetConstantRegU32(cf.MipsT())); 1225 else 1226 cg->cmp(rs, MipsPtr(cf.MipsT())); 1227 1228 if (rd == rs || rd == rt) 1229 cg->mov(rd, 0); 1230 1231 sign ? cg->setl(rd.cvt8()) : cg->setb(rd.cvt8()); 1232 } 1233 1234 Xbyak::Reg32 1235 CPU::NewRec::X64Compiler::ComputeLoadStoreAddressArg(CompileFlags cf, 1236 const std::optional<VirtualMemoryAddress>& address, 1237 const std::optional<const Xbyak::Reg32>& reg /* = std::nullopt */) 1238 { 1239 const u32 imm = inst->i.imm_sext32(); 1240 if (cf.valid_host_s && imm == 0 && !reg.has_value()) 1241 return CFGetRegS(cf); 1242 1243 const Reg32 dst = reg.has_value() ? reg.value() : RWARG1; 1244 if (address.has_value()) 1245 { 1246 cg->mov(dst, address.value()); 1247 } 1248 else 1249 { 1250 if (cf.valid_host_s) 1251 { 1252 if (const Reg32 src = CFGetRegS(cf); src != dst) 1253 cg->mov(dst, CFGetRegS(cf)); 1254 } 1255 else 1256 { 1257 cg->mov(dst, MipsPtr(cf.MipsS())); 1258 } 1259 1260 if (imm != 0) 1261 cg->add(dst, inst->i.imm_sext32()); 1262 } 1263 1264 return dst; 1265 } 1266 1267 template<typename RegAllocFn> 1268 Xbyak::Reg32 CPU::NewRec::X64Compiler::GenerateLoad(const Xbyak::Reg32& addr_reg, MemoryAccessSize size, bool sign, 1269 bool use_fastmem, const RegAllocFn& dst_reg_alloc) 1270 { 1271 if (use_fastmem) 1272 { 1273 m_cycles += Bus::RAM_READ_TICKS; 1274 1275 const Reg32 dst = dst_reg_alloc(); 1276 1277 if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) 1278 { 1279 DebugAssert(addr_reg != RWARG3); 1280 cg->mov(RWARG3, addr_reg.cvt32()); 1281 cg->shr(RXARG3, Bus::FASTMEM_LUT_PAGE_SHIFT); 1282 cg->mov(RXARG3, cg->qword[RMEMBASE + RXARG3 * 8]); 1283 } 1284 1285 const Reg64 membase = (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) ? RXARG3 : RMEMBASE; 1286 u8* start = cg->getCurr<u8*>(); 1287 switch (size) 1288 { 1289 case MemoryAccessSize::Byte: 1290 { 1291 sign ? cg->movsx(dst, cg->byte[membase + addr_reg.cvt64()]) : 1292 cg->movzx(dst, cg->byte[membase + addr_reg.cvt64()]); 1293 } 1294 break; 1295 1296 case MemoryAccessSize::HalfWord: 1297 { 1298 sign ? 
cg->movsx(dst, cg->word[membase + addr_reg.cvt64()]) : 1299 cg->movzx(dst, cg->word[membase + addr_reg.cvt64()]); 1300 } 1301 break; 1302 1303 case MemoryAccessSize::Word: 1304 { 1305 cg->mov(dst, cg->word[membase + addr_reg.cvt64()]); 1306 } 1307 break; 1308 } 1309 1310 u8* end = cg->getCurr<u8*>(); 1311 while ((end - start) < BACKPATCH_JMP_SIZE) 1312 { 1313 cg->nop(); 1314 end = cg->getCurr<u8*>(); 1315 } 1316 1317 AddLoadStoreInfo(start, static_cast<u32>(end - start), static_cast<u32>(addr_reg.getIdx()), 1318 static_cast<u32>(dst.getIdx()), size, sign, true); 1319 return dst; 1320 } 1321 1322 if (addr_reg != RWARG1) 1323 cg->mov(RWARG1, addr_reg); 1324 1325 const bool checked = g_settings.cpu_recompiler_memory_exceptions; 1326 switch (size) 1327 { 1328 case MemoryAccessSize::Byte: 1329 { 1330 cg->call(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::ReadMemoryByte) : 1331 reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryByte)); 1332 } 1333 break; 1334 case MemoryAccessSize::HalfWord: 1335 { 1336 cg->call(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::ReadMemoryHalfWord) : 1337 reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryHalfWord)); 1338 } 1339 break; 1340 case MemoryAccessSize::Word: 1341 { 1342 cg->call(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::ReadMemoryWord) : 1343 reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryWord)); 1344 } 1345 break; 1346 } 1347 1348 // TODO: turn this into an asm function instead 1349 if (checked) 1350 { 1351 cg->test(RXRET, RXRET); 1352 1353 BackupHostState(); 1354 SwitchToFarCode(true, &CodeGenerator::js); 1355 1356 // flush regs, but not pc, it's going to get overwritten 1357 // flush cycles because of the GTE instruction stuff... 1358 Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION); 1359 1360 // cause_bits = (-result << 2) | BD | cop_n 1361 cg->mov(RWARG1, RWRET); 1362 cg->neg(RWARG1); 1363 cg->shl(RWARG1, 2); 1364 cg->or_(RWARG1, Cop0Registers::CAUSE::MakeValueForException( 1365 static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)); 1366 cg->mov(RWARG2, m_current_instruction_pc); 1367 cg->call(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)); 1368 m_dirty_pc = false; 1369 EndAndLinkBlock(std::nullopt, true, false); 1370 1371 SwitchToNearCode(false); 1372 RestoreHostState(); 1373 } 1374 1375 const Xbyak::Reg32 dst_reg = dst_reg_alloc(); 1376 switch (size) 1377 { 1378 case MemoryAccessSize::Byte: 1379 { 1380 sign ? cg->movsx(dst_reg, RWRET.cvt8()) : cg->movzx(dst_reg, RWRET.cvt8()); 1381 } 1382 break; 1383 case MemoryAccessSize::HalfWord: 1384 { 1385 sign ? cg->movsx(dst_reg, RWRET.cvt16()) : cg->movzx(dst_reg, RWRET.cvt16()); 1386 } 1387 break; 1388 case MemoryAccessSize::Word: 1389 { 1390 if (dst_reg != RWRET) 1391 cg->mov(dst_reg, RWRET); 1392 } 1393 break; 1394 } 1395 1396 return dst_reg; 1397 } 1398 1399 void CPU::NewRec::X64Compiler::GenerateStore(const Xbyak::Reg32& addr_reg, const Xbyak::Reg32& value_reg, 1400 MemoryAccessSize size, bool use_fastmem) 1401 { 1402 if (use_fastmem) 1403 { 1404 if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) 1405 { 1406 DebugAssert(addr_reg != RWARG3 && value_reg != RWARG3); 1407 cg->mov(RWARG3, addr_reg.cvt32()); 1408 cg->shr(RXARG3, Bus::FASTMEM_LUT_PAGE_SHIFT); 1409 cg->mov(RXARG3, cg->qword[RMEMBASE + RXARG3 * 8]); 1410 } 1411 1412 const Reg64 membase = (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) ? 
RXARG3 : RMEMBASE; 1413 u8* start = cg->getCurr<u8*>(); 1414 switch (size) 1415 { 1416 case MemoryAccessSize::Byte: 1417 cg->mov(cg->byte[membase + addr_reg.cvt64()], value_reg.cvt8()); 1418 break; 1419 1420 case MemoryAccessSize::HalfWord: 1421 cg->mov(cg->word[membase + addr_reg.cvt64()], value_reg.cvt16()); 1422 break; 1423 1424 case MemoryAccessSize::Word: 1425 cg->mov(cg->word[membase + addr_reg.cvt64()], value_reg.cvt32()); 1426 break; 1427 } 1428 1429 u8* end = cg->getCurr<u8*>(); 1430 while ((end - start) < BACKPATCH_JMP_SIZE) 1431 { 1432 cg->nop(); 1433 end = cg->getCurr<u8*>(); 1434 } 1435 1436 AddLoadStoreInfo(start, static_cast<u32>(end - start), static_cast<u32>(addr_reg.getIdx()), 1437 static_cast<u32>(value_reg.getIdx()), size, false, false); 1438 return; 1439 } 1440 1441 if (addr_reg != RWARG1) 1442 cg->mov(RWARG1, addr_reg); 1443 if (value_reg != RWARG2) 1444 cg->mov(RWARG2, value_reg); 1445 1446 const bool checked = g_settings.cpu_recompiler_memory_exceptions; 1447 switch (size) 1448 { 1449 case MemoryAccessSize::Byte: 1450 { 1451 cg->call(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::WriteMemoryByte) : 1452 reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryByte)); 1453 } 1454 break; 1455 case MemoryAccessSize::HalfWord: 1456 { 1457 cg->call(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::WriteMemoryHalfWord) : 1458 reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryHalfWord)); 1459 } 1460 break; 1461 case MemoryAccessSize::Word: 1462 { 1463 cg->call(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::WriteMemoryWord) : 1464 reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryWord)); 1465 } 1466 break; 1467 } 1468 1469 // TODO: turn this into an asm function instead 1470 if (checked) 1471 { 1472 cg->test(RWRET, RWRET); 1473 1474 BackupHostState(); 1475 SwitchToFarCode(true, &CodeGenerator::jnz); 1476 1477 // flush regs, but not pc, it's going to get overwritten 1478 // flush cycles because of the GTE instruction stuff... 1479 Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION); 1480 1481 // cause_bits = (result << 2) | BD | cop_n 1482 cg->mov(RWARG1, RWRET); 1483 cg->shl(RWARG1, 2); 1484 cg->or_(RWARG1, Cop0Registers::CAUSE::MakeValueForException( 1485 static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)); 1486 cg->mov(RWARG2, m_current_instruction_pc); 1487 cg->call(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException))); 1488 m_dirty_pc = false; 1489 EndAndLinkBlock(std::nullopt, true, false); 1490 1491 SwitchToNearCode(false); 1492 RestoreHostState(); 1493 } 1494 } 1495 1496 void CPU::NewRec::X64Compiler::Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem, 1497 const std::optional<VirtualMemoryAddress>& address) 1498 { 1499 const std::optional<Reg32> addr_reg = g_settings.gpu_pgxp_enable ? 1500 std::optional<Reg32>(Reg32(AllocateTempHostReg(HR_CALLEE_SAVED))) : 1501 std::optional<Reg32>(); 1502 FlushForLoadStore(address, false, use_fastmem); 1503 const Reg32 addr = ComputeLoadStoreAddressArg(cf, address, addr_reg); 1504 1505 const Reg32 data = GenerateLoad(addr, size, sign, use_fastmem, [this, cf]() { 1506 if (cf.MipsT() == Reg::zero) 1507 return RWRET; 1508 1509 return Reg32(AllocateHostReg(GetFlagsForNewLoadDelayedReg(), 1510 EMULATE_LOAD_DELAYS ? 
HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, cf.MipsT())); 1511 }); 1512 1513 if (g_settings.gpu_pgxp_enable) 1514 { 1515 Flush(FLUSH_FOR_C_CALL); 1516 1517 cg->mov(RWARG1, inst->bits); 1518 cg->mov(RWARG2, addr); 1519 cg->mov(RWARG3, data); 1520 cg->call(s_pgxp_mem_load_functions[static_cast<u32>(size)][static_cast<u32>(sign)]); 1521 FreeHostReg(addr_reg.value().getIdx()); 1522 } 1523 } 1524 1525 void CPU::NewRec::X64Compiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem, 1526 const std::optional<VirtualMemoryAddress>& address) 1527 { 1528 DebugAssert(size == MemoryAccessSize::Word && !sign); 1529 1530 const Reg32 addr = Reg32(AllocateTempHostReg(HR_CALLEE_SAVED)); 1531 FlushForLoadStore(address, false, use_fastmem); 1532 1533 // TODO: if address is constant, this can be simplified.. 1534 1535 // If we're coming from another block, just flush the load delay and hope for the best.. 1536 if (m_load_delay_dirty) 1537 UpdateLoadDelay(); 1538 1539 // We'd need to be careful here if we weren't overwriting it.. 1540 ComputeLoadStoreAddressArg(cf, address, addr); 1541 cg->mov(RWARG1, addr); 1542 cg->and_(RWARG1, ~0x3u); 1543 GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; }); 1544 1545 if (inst->r.rt == Reg::zero) 1546 { 1547 FreeHostReg(addr.getIdx()); 1548 return; 1549 } 1550 1551 // lwl/lwr from a load-delayed value takes the new value, but it itself, is load delayed, so the original value is 1552 // never written back. NOTE: can't trust T in cf because of the flush 1553 const Reg rt = inst->r.rt; 1554 Reg32 value; 1555 if (m_load_delay_register == rt) 1556 { 1557 const u32 existing_ld_rt = (m_load_delay_value_register == NUM_HOST_REGS) ? 1558 AllocateHostReg(HR_MODE_READ, HR_TYPE_LOAD_DELAY_VALUE, rt) : 1559 m_load_delay_value_register; 1560 RenameHostReg(existing_ld_rt, HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt); 1561 value = Reg32(existing_ld_rt); 1562 } 1563 else 1564 { 1565 if constexpr (EMULATE_LOAD_DELAYS) 1566 { 1567 value = Reg32(AllocateHostReg(HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt)); 1568 if (HasConstantReg(rt)) 1569 cg->mov(value, GetConstantRegU32(rt)); 1570 else if (const std::optional<u32> rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value()) 1571 cg->mov(value, Reg32(rtreg.value())); 1572 else 1573 cg->mov(value, MipsPtr(rt)); 1574 } 1575 else 1576 { 1577 value = Reg32(AllocateHostReg(HR_MODE_READ | HR_MODE_WRITE, HR_TYPE_CPU_REG, rt)); 1578 } 1579 } 1580 1581 DebugAssert(value != cg->ecx); 1582 cg->mov(cg->ecx, addr); 1583 cg->and_(cg->ecx, 3); 1584 cg->shl(cg->ecx, 3); // *8 1585 1586 // TODO for other arch: reverse subtract 1587 DebugAssert(RWARG2 != cg->ecx); 1588 cg->mov(RWARG2, 24); 1589 cg->sub(RWARG2, cg->ecx); 1590 1591 if (inst->op == InstructionOp::lwl) 1592 { 1593 // const u32 mask = UINT32_C(0x00FFFFFF) >> shift; 1594 // new_value = (value & mask) | (RWRET << (24 - shift)); 1595 cg->mov(RWARG3, 0xFFFFFFu); 1596 cg->shr(RWARG3, cg->cl); 1597 cg->and_(value, RWARG3); 1598 cg->mov(cg->ecx, RWARG2); 1599 cg->shl(RWRET, cg->cl); 1600 cg->or_(value, RWRET); 1601 } 1602 else 1603 { 1604 // const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift); 1605 // new_value = (value & mask) | (RWRET >> shift); 1606 cg->shr(RWRET, cg->cl); 1607 cg->mov(RWARG3, 0xFFFFFF00u); 1608 cg->mov(cg->ecx, RWARG2); 1609 cg->shl(RWARG3, cg->cl); 1610 cg->and_(value, RWARG3); 1611 cg->or_(value, RWRET); 1612 } 1613 1614 FreeHostReg(addr.getIdx()); 1615 1616 if 
(g_settings.gpu_pgxp_enable) 1617 { 1618 Flush(FLUSH_FOR_C_CALL); 1619 1620 DebugAssert(value != RWARG3); 1621 cg->mov(RWARG3, value); 1622 cg->mov(RWARG2, addr); 1623 cg->and_(RWARG2, ~0x3u); 1624 cg->mov(RWARG1, inst->bits); 1625 cg->call(reinterpret_cast<const void*>(&PGXP::CPU_LW)); 1626 } 1627 } 1628 1629 void CPU::NewRec::X64Compiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem, 1630 const std::optional<VirtualMemoryAddress>& address) 1631 { 1632 const u32 index = static_cast<u32>(inst->r.rt.GetValue()); 1633 const auto [ptr, action] = GetGTERegisterPointer(index, true); 1634 const std::optional<Reg32> addr_reg = g_settings.gpu_pgxp_enable ? 1635 std::optional<Reg32>(Reg32(AllocateTempHostReg(HR_CALLEE_SAVED))) : 1636 std::optional<Reg32>(); 1637 FlushForLoadStore(address, false, use_fastmem); 1638 const Reg32 addr = ComputeLoadStoreAddressArg(cf, address, addr_reg); 1639 const Reg32 value = GenerateLoad(addr, MemoryAccessSize::Word, false, use_fastmem, [this, action = action]() { 1640 return (action == GTERegisterAccessAction::CallHandler && g_settings.gpu_pgxp_enable) ? 1641 Reg32(AllocateTempHostReg(HR_CALLEE_SAVED)) : 1642 RWRET; 1643 }); 1644 1645 switch (action) 1646 { 1647 case GTERegisterAccessAction::Ignore: 1648 { 1649 break; 1650 } 1651 1652 case GTERegisterAccessAction::Direct: 1653 { 1654 cg->mov(cg->dword[PTR(ptr)], value); 1655 break; 1656 } 1657 1658 case GTERegisterAccessAction::SignExtend16: 1659 { 1660 cg->movsx(RWARG3, value.cvt16()); 1661 cg->mov(cg->dword[PTR(ptr)], RWARG3); 1662 break; 1663 } 1664 1665 case GTERegisterAccessAction::ZeroExtend16: 1666 { 1667 cg->movzx(RWARG3, value.cvt16()); 1668 cg->mov(cg->dword[PTR(ptr)], RWARG3); 1669 break; 1670 } 1671 1672 case GTERegisterAccessAction::CallHandler: 1673 { 1674 Flush(FLUSH_FOR_C_CALL); 1675 cg->mov(RWARG2, value); 1676 cg->mov(RWARG1, index); 1677 cg->call(&GTE::WriteRegister); 1678 break; 1679 } 1680 1681 case GTERegisterAccessAction::PushFIFO: 1682 { 1683 // SXY0 <- SXY1 1684 // SXY1 <- SXY2 1685 // SXY2 <- SXYP 1686 DebugAssert(value != RWARG1 && value != RWARG2); 1687 cg->mov(RWARG1, cg->dword[PTR(&g_state.gte_regs.SXY1[0])]); 1688 cg->mov(RWARG2, cg->dword[PTR(&g_state.gte_regs.SXY2[0])]); 1689 cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY0[0])], RWARG1); 1690 cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY1[0])], RWARG2); 1691 cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY2[0])], value); 1692 break; 1693 } 1694 1695 default: 1696 { 1697 Panic("Unknown action"); 1698 return; 1699 } 1700 } 1701 1702 if (g_settings.gpu_pgxp_enable) 1703 { 1704 Flush(FLUSH_FOR_C_CALL); 1705 cg->mov(RWARG3, value); 1706 if (value != RWRET) 1707 FreeHostReg(value.getIdx()); 1708 cg->mov(RWARG2, addr); 1709 FreeHostReg(addr_reg.value().getIdx()); 1710 cg->mov(RWARG1, inst->bits); 1711 cg->call(reinterpret_cast<const void*>(&PGXP::CPU_LWC2)); 1712 } 1713 } 1714 1715 void CPU::NewRec::X64Compiler::Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem, 1716 const std::optional<VirtualMemoryAddress>& address) 1717 { 1718 const std::optional<Reg32> addr_reg = g_settings.gpu_pgxp_enable ? 1719 std::optional<Reg32>(Reg32(AllocateTempHostReg(HR_CALLEE_SAVED))) : 1720 std::optional<Reg32>(); 1721 FlushForLoadStore(address, true, use_fastmem); 1722 const Reg32 addr = ComputeLoadStoreAddressArg(cf, address, addr_reg); 1723 const Reg32 data = cf.valid_host_t ?
CFGetRegT(cf) : RWARG2; 1724 if (!cf.valid_host_t) 1725 MoveTToReg(RWARG2, cf); 1726 1727 GenerateStore(addr, data, size, use_fastmem); 1728 1729 if (g_settings.gpu_pgxp_enable) 1730 { 1731 Flush(FLUSH_FOR_C_CALL); 1732 MoveMIPSRegToReg(RWARG3, cf.MipsT()); 1733 cg->mov(RWARG2, addr); 1734 cg->mov(RWARG1, inst->bits); 1735 cg->call(s_pgxp_mem_store_functions[static_cast<u32>(size)]); 1736 FreeHostReg(addr_reg.value().getIdx()); 1737 } 1738 } 1739 1740 void CPU::NewRec::X64Compiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem, 1741 const std::optional<VirtualMemoryAddress>& address) 1742 { 1743 DebugAssert(size == MemoryAccessSize::Word && !sign); 1744 1745 // TODO: this can take over rt's value if it's no longer needed 1746 // NOTE: can't trust T in cf because of the alloc 1747 const Reg32 addr = Reg32(AllocateTempHostReg(HR_CALLEE_SAVED)); 1748 const Reg32 value = g_settings.gpu_pgxp_enable ? Reg32(AllocateTempHostReg(HR_CALLEE_SAVED)) : RWARG2; 1749 if (g_settings.gpu_pgxp_enable) 1750 MoveMIPSRegToReg(value, inst->r.rt); 1751 1752 FlushForLoadStore(address, true, use_fastmem); 1753 1754 // TODO: if address is constant, this can be simplified.. 1755 // We'd need to be careful here if we weren't overwriting it.. 1756 ComputeLoadStoreAddressArg(cf, address, addr); 1757 cg->mov(RWARG1, addr); 1758 cg->and_(RWARG1, ~0x3u); 1759 GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; }); 1760 1761 DebugAssert(value != cg->ecx); 1762 cg->mov(cg->ecx, addr); 1763 cg->and_(cg->ecx, 3); 1764 cg->shl(cg->ecx, 3); // *8 1765 cg->and_(addr, ~0x3u); 1766 1767 // Need to load down here for PGXP-off, because it's in a volatile reg that can get overwritten by flush. 1768 if (!g_settings.gpu_pgxp_enable) 1769 MoveMIPSRegToReg(value, inst->r.rt); 1770 1771 if (inst->op == InstructionOp::swl) 1772 { 1773 // const u32 mem_mask = UINT32_C(0xFFFFFF00) << shift; 1774 // new_value = (RWRET & mem_mask) | (value >> (24 - shift)); 1775 cg->mov(RWARG3, 0xFFFFFF00u); 1776 cg->shl(RWARG3, cg->cl); 1777 cg->and_(RWRET, RWARG3); 1778 1779 cg->mov(RWARG3, 24); 1780 cg->sub(RWARG3, cg->ecx); 1781 cg->mov(cg->ecx, RWARG3); 1782 cg->shr(value, cg->cl); 1783 cg->or_(value, RWRET); 1784 } 1785 else 1786 { 1787 // const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift); 1788 // new_value = (RWRET & mem_mask) | (value << shift); 1789 cg->shl(value, cg->cl); 1790 1791 DebugAssert(RWARG3 != cg->ecx); 1792 cg->mov(RWARG3, 24); 1793 cg->sub(RWARG3, cg->ecx); 1794 cg->mov(cg->ecx, RWARG3); 1795 cg->mov(RWARG3, 0x00FFFFFFu); 1796 cg->shr(RWARG3, cg->cl); 1797 cg->and_(RWRET, RWARG3); 1798 cg->or_(value, RWRET); 1799 } 1800 1801 if (!g_settings.gpu_pgxp_enable) 1802 { 1803 GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem); 1804 FreeHostReg(addr.getIdx()); 1805 } 1806 else 1807 { 1808 GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem); 1809 1810 Flush(FLUSH_FOR_C_CALL); 1811 cg->mov(RWARG3, value); 1812 FreeHostReg(value.getIdx()); 1813 cg->mov(RWARG2, addr); 1814 FreeHostReg(addr.getIdx()); 1815 cg->mov(RWARG1, inst->bits); 1816 cg->call(reinterpret_cast<const void*>(&PGXP::CPU_SW)); 1817 } 1818 } 1819 1820 void CPU::NewRec::X64Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem, 1821 const std::optional<VirtualMemoryAddress>& address) 1822 { 1823 const u32 index = static_cast<u32>(inst->r.rt.GetValue()); 1824 const auto [ptr, action] = GetGTERegisterPointer(index, false); 1825 switch (action) 1826 { 
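// Fetch the GTE register value to be stored into RWARG2 first: Direct copies it straight
// out of g_state, CallHandler retrieves it through GTE::ReadRegister().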
1827 case GTERegisterAccessAction::Direct: 1828 { 1829 cg->mov(RWARG2, cg->dword[PTR(ptr)]); 1830 } 1831 break; 1832 1833 case GTERegisterAccessAction::CallHandler: 1834 { 1835 // should already be flushed.. except in fastmem case 1836 Flush(FLUSH_FOR_C_CALL); 1837 cg->mov(RWARG1, index); 1838 cg->call(&GTE::ReadRegister); 1839 cg->mov(RWARG2, RWRET); 1840 } 1841 break; 1842 1843 default: 1844 { 1845 Panic("Unknown action"); 1846 } 1847 break; 1848 } 1849 1850 // PGXP makes this a giant pain. 1851 if (!g_settings.gpu_pgxp_enable) 1852 { 1853 FlushForLoadStore(address, true, use_fastmem); 1854 const Reg32 addr = ComputeLoadStoreAddressArg(cf, address); 1855 GenerateStore(addr, RWARG2, size, use_fastmem); 1856 return; 1857 } 1858 1859 // TODO: This can be simplified because we don't need to validate in PGXP.. 1860 const Reg32 addr_reg = Reg32(AllocateTempHostReg(HR_CALLEE_SAVED)); 1861 const Reg32 data_backup = Reg32(AllocateTempHostReg(HR_CALLEE_SAVED)); 1862 FlushForLoadStore(address, true, use_fastmem); 1863 ComputeLoadStoreAddressArg(cf, address, addr_reg); 1864 cg->mov(data_backup, RWARG2); 1865 GenerateStore(addr_reg, RWARG2, size, use_fastmem); 1866 1867 Flush(FLUSH_FOR_C_CALL); 1868 cg->mov(RWARG3, data_backup); 1869 cg->mov(RWARG2, addr_reg); 1870 cg->mov(RWARG1, inst->bits); 1871 cg->call(reinterpret_cast<const void*>(&PGXP::CPU_SWC2)); 1872 FreeHostReg(addr_reg.getIdx()); 1873 FreeHostReg(data_backup.getIdx()); 1874 } 1875 1876 void CPU::NewRec::X64Compiler::Compile_mtc0(CompileFlags cf) 1877 { 1878 const Cop0Reg reg = static_cast<Cop0Reg>(MipsD()); 1879 const u32* ptr = GetCop0RegPtr(reg); 1880 const u32 mask = GetCop0RegWriteMask(reg); 1881 if (!ptr) 1882 { 1883 Compile_Fallback(); 1884 return; 1885 } 1886 1887 // TODO: const apply mask 1888 const Reg32 rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG1; 1889 const u32 constant_value = cf.const_t ? GetConstantRegU32(cf.MipsT()) : 0; 1890 if (mask == 0) 1891 { 1892 // if it's a read-only register, ignore 1893 DEBUG_LOG("Ignoring write to read-only cop0 reg {}", static_cast<u32>(reg)); 1894 return; 1895 } 1896 1897 // for some registers, we need to test certain bits 1898 const bool needs_bit_test = (reg == Cop0Reg::SR); 1899 const Reg32 changed_bits = RWARG3; 1900 1901 // update value 1902 if (cf.valid_host_t) 1903 { 1904 cg->mov(RWARG1, rt); 1905 cg->mov(RWARG2, cg->dword[PTR(ptr)]); 1906 cg->and_(RWARG1, mask); 1907 if (needs_bit_test) 1908 { 1909 cg->mov(changed_bits, RWARG2); 1910 cg->xor_(changed_bits, RWARG1); 1911 } 1912 cg->and_(RWARG2, ~mask); 1913 cg->or_(RWARG2, RWARG1); 1914 cg->mov(cg->dword[PTR(ptr)], RWARG2); 1915 } 1916 else 1917 { 1918 cg->mov(RWARG2, cg->dword[PTR(ptr)]); 1919 if (needs_bit_test) 1920 { 1921 cg->mov(changed_bits, RWARG2); 1922 cg->xor_(changed_bits, constant_value & mask); 1923 } 1924 cg->and_(RWARG2, ~mask); 1925 cg->or_(RWARG2, constant_value & mask); 1926 cg->mov(cg->dword[PTR(ptr)], RWARG2); 1927 } 1928 1929 if (reg == Cop0Reg::SR) 1930 { 1931 // TODO: replace with register backup 1932 // We could just inline the whole thing..
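// Slow path below: if the write toggled SR bit 16 (cache isolation), memory pointers and the
// fastmem base have to be refreshed, so RWARG2 is spilled to the stack around the call to
// CPU::UpdateMemoryPointers(); either way the new SR value is then fed to TestInterrupts().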
    Flush(FLUSH_FOR_C_CALL);

    cg->test(changed_bits, 1u << 16);
    SwitchToFarCode(true, &CodeGenerator::jnz);
    cg->mov(cg->dword[cg->rsp], RWARG2);
    cg->sub(cg->rsp, STACK_SHADOW_SIZE + 8);
    cg->call(&CPU::UpdateMemoryPointers);
    cg->add(cg->rsp, STACK_SHADOW_SIZE + 8);
    cg->mov(RWARG2, cg->dword[cg->rsp]);
    cg->mov(RMEMBASE, cg->qword[PTR(&g_state.fastmem_base)]);
    SwitchToNearCode(true);

    TestInterrupts(RWARG2);
  }
  else if (reg == Cop0Reg::CAUSE)
  {
    cg->mov(RWARG1, cg->dword[PTR(&g_state.cop0_regs.sr.bits)]);
    TestInterrupts(RWARG1);
  }

  if (reg == Cop0Reg::DCIC && g_settings.cpu_recompiler_memory_exceptions)
  {
    // TODO: DCIC handling for debug breakpoints
    WARNING_LOG("TODO: DCIC handling for debug breakpoints");
  }
}

void CPU::NewRec::X64Compiler::Compile_rfe(CompileFlags cf)
{
  // shift mode bits right two, preserving upper bits
  static constexpr u32 mode_bits_mask = UINT32_C(0b1111);
  cg->mov(RWARG1, cg->dword[PTR(&g_state.cop0_regs.sr.bits)]);
  cg->mov(RWARG2, RWARG1);
  cg->shr(RWARG2, 2);
  cg->and_(RWARG1, ~mode_bits_mask);
  cg->and_(RWARG2, mode_bits_mask);
  cg->or_(RWARG1, RWARG2);
  cg->mov(cg->dword[PTR(&g_state.cop0_regs.sr.bits)], RWARG1);

  TestInterrupts(RWARG1);
}

void CPU::NewRec::X64Compiler::TestInterrupts(const Xbyak::Reg32& sr)
{
  // if Iec == 0 then goto no_interrupt
  Label no_interrupt;

  cg->test(sr, 1);
  cg->jz(no_interrupt, CodeGenerator::T_NEAR);

  // sr & cause
  cg->and_(sr, cg->dword[PTR(&g_state.cop0_regs.cause.bits)]);

  // ((sr & cause) & 0xff00) == 0 goto no_interrupt
  cg->test(sr, 0xFF00);

  SwitchToFarCode(true, &CodeGenerator::jnz);
  BackupHostState();

  // Update load delay, this normally happens at the end of an instruction, but we're finishing it early.
  UpdateLoadDelay();

  Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);

  // Can't use EndBlockWithException() here, because it'll use the wrong PC.
  // Can't use RaiseException() on the fast path if we're the last instruction, because the next PC is unknown.
  if (!iinfo->is_last_instruction)
  {
    cg->mov(RWARG1, Cop0Registers::CAUSE::MakeValueForException(Exception::INT, iinfo->is_branch_instruction, false,
                                                                (inst + 1)->cop.cop_n));
    cg->mov(RWARG2, m_compiler_pc);
    cg->call(static_cast<void (*)(u32, u32)>(&CPU::RaiseException));
    m_dirty_pc = false;
    EndAndLinkBlock(std::nullopt, true, false);
  }
  else
  {
    if (m_dirty_pc)
      cg->mov(cg->dword[PTR(&g_state.pc)], m_compiler_pc);
    m_dirty_pc = false;
    cg->mov(cg->dword[PTR(&g_state.downcount)], 0);
    EndAndLinkBlock(std::nullopt, false, true);
  }

  RestoreHostState();
  SwitchToNearCode(false);

  cg->L(no_interrupt);
}

void CPU::NewRec::X64Compiler::Compile_mfc2(CompileFlags cf)
{
  const u32 index = inst->cop.Cop2Index();
  const Reg rt = inst->r.rt;

  const auto [ptr, action] = GetGTERegisterPointer(index, false);
  if (action == GTERegisterAccessAction::Ignore)
    return;

  u32 hreg;
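  // GTE reads go through the load delay slot like memory loads, so when load delays are being emulated the
  // result is staged as the next load-delay value for rt rather than being committed immediately.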
  if (action == GTERegisterAccessAction::Direct)
  {
    hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
                           EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
    cg->mov(Reg32(hreg), cg->dword[PTR(ptr)]);
  }
  else if (action == GTERegisterAccessAction::CallHandler)
  {
    Flush(FLUSH_FOR_C_CALL);
    cg->mov(RWARG1, index);
    cg->call(&GTE::ReadRegister);

    hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
                           EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
    cg->mov(Reg32(hreg), RWRET);
  }
  else
  {
    Panic("Unknown action");
    return;
  }

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    cg->mov(RWARG1, inst->bits);
    cg->mov(RWARG2, Reg32(hreg));
    cg->call(reinterpret_cast<const void*>(&PGXP::CPU_MFC2));
  }
}

void CPU::NewRec::X64Compiler::Compile_mtc2(CompileFlags cf)
{
  const u32 index = inst->cop.Cop2Index();
  const auto [ptr, action] = GetGTERegisterPointer(index, true);
  if (action == GTERegisterAccessAction::Ignore)
    return;

  if (action == GTERegisterAccessAction::Direct)
  {
    if (cf.const_t)
    {
      cg->mov(cg->dword[PTR(ptr)], GetConstantRegU32(cf.MipsT()));
    }
    else if (cf.valid_host_t)
    {
      cg->mov(cg->dword[PTR(ptr)], CFGetRegT(cf));
    }
    else
    {
      cg->mov(RWARG1, MipsPtr(cf.MipsT()));
      cg->mov(cg->dword[PTR(ptr)], RWARG1);
    }
  }
  else if (action == GTERegisterAccessAction::SignExtend16 || action == GTERegisterAccessAction::ZeroExtend16)
  {
    const bool sign = (action == GTERegisterAccessAction::SignExtend16);
    if (cf.const_t)
    {
      const u16 cv = Truncate16(GetConstantRegU32(cf.MipsT()));
      cg->mov(cg->dword[PTR(ptr)], sign ? ::SignExtend32(cv) : ::ZeroExtend32(cv));
    }
    else if (cf.valid_host_t)
    {
      sign ? cg->movsx(RWARG1, Reg16(cf.host_t)) : cg->movzx(RWARG1, Reg16(cf.host_t));
      cg->mov(cg->dword[PTR(ptr)], RWARG1);
    }
    else
    {
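      // Neither constant nor cached in a host register: read the low 16 bits of the guest register straight
      // from CPU state and sign/zero-extend as the target GTE register requires.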
      sign ? cg->movsx(RWARG1, cg->word[PTR(&g_state.regs.r[cf.mips_t])]) :
             cg->movzx(RWARG1, cg->word[PTR(&g_state.regs.r[cf.mips_t])]);
      cg->mov(cg->dword[PTR(ptr)], RWARG1);
    }
  }
  else if (action == GTERegisterAccessAction::CallHandler)
  {
    Flush(FLUSH_FOR_C_CALL);
    cg->mov(RWARG1, index);
    MoveTToReg(RWARG2, cf);
    cg->call(&GTE::WriteRegister);
  }
  else if (action == GTERegisterAccessAction::PushFIFO)
  {
    // SXY0 <- SXY1
    // SXY1 <- SXY2
    // SXY2 <- SXYP
    cg->mov(RWARG1, cg->dword[PTR(&g_state.gte_regs.SXY1[0])]);
    cg->mov(RWARG2, cg->dword[PTR(&g_state.gte_regs.SXY2[0])]);
    if (!cf.const_t && !cf.valid_host_t)
      cg->mov(RWARG3, MipsPtr(cf.MipsT()));
    cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY0[0])], RWARG1);
    cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY1[0])], RWARG2);
    if (cf.const_t)
      cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY2[0])], GetConstantRegU32(cf.MipsT()));
    else if (cf.valid_host_t)
      cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY2[0])], CFGetRegT(cf));
    else
      cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY2[0])], RWARG3);
  }
  else
  {
    Panic("Unknown action");
  }
}

void CPU::NewRec::X64Compiler::Compile_cop2(CompileFlags cf)
{
  TickCount func_ticks;
  GTE::InstructionImpl func = GTE::GetInstructionImpl(inst->bits, &func_ticks);

  Flush(FLUSH_FOR_C_CALL);
  cg->mov(RWARG1, inst->bits & GTE::Instruction::REQUIRED_BITS_MASK);
  cg->call(reinterpret_cast<const void*>(func));

  AddGTETicks(func_ticks);
}

u32 CPU::NewRec::CompileLoadStoreThunk(void* thunk_code, u32 thunk_space, void* code_address, u32 code_size,
                                       TickCount cycles_to_add, TickCount cycles_to_remove, u32 gpr_bitmask,
                                       u8 address_register, u8 data_register, MemoryAccessSize size, bool is_signed,
                                       bool is_load)
{
  CodeGenerator acg(thunk_space, thunk_code);
  CodeGenerator* cg = &acg;

  static constexpr u32 GPR_SIZE = 8;

  // save regs
  u32 num_gprs = 0;

  for (u32 i = 0; i < NUM_HOST_REGS; i++)
  {
    if ((gpr_bitmask & (1u << i)) && IsCallerSavedRegister(i) && (!is_load || data_register != i))
      num_gprs++;
  }

  const u32 stack_size = (((num_gprs + 1) & ~1u) * GPR_SIZE) + STACK_SHADOW_SIZE;

  if (stack_size > 0)
  {
    cg->sub(cg->rsp, stack_size);

    u32 stack_offset = STACK_SHADOW_SIZE;
    for (u32 i = 0; i < NUM_HOST_REGS; i++)
    {
      if ((gpr_bitmask & (1u << i)) && IsCallerSavedRegister(i) && (!is_load || data_register != i))
      {
        cg->mov(cg->qword[cg->rsp + stack_offset], Reg64(i));
        stack_offset += GPR_SIZE;
      }
    }
  }

  if (cycles_to_add != 0)
    cg->add(cg->dword[PTR(&g_state.pending_ticks)], cycles_to_add);

  if (address_register != static_cast<u8>(RWARG1.getIdx()))
    cg->mov(RWARG1, Reg32(address_register));

  if (!is_load)
  {
    if (data_register != static_cast<u8>(RWARG2.getIdx()))
      cg->mov(RWARG2, Reg32(data_register));
  }
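
  // Dispatch to the unchecked memory handler for this access size; loads return their value in RWRET, which is
  // then extended/moved into the original destination register below.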
  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      cg->call(is_load ? reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryByte) :
                         reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryByte));
    }
    break;
    case MemoryAccessSize::HalfWord:
    {
      cg->call(is_load ? reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryHalfWord) :
                         reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryHalfWord));
    }
    break;
    case MemoryAccessSize::Word:
    {
      cg->call(is_load ? reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryWord) :
                         reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryWord));
    }
    break;
  }

  if (is_load)
  {
    const Reg32 dst = Reg32(data_register);
    switch (size)
    {
      case MemoryAccessSize::Byte:
      {
        is_signed ? cg->movsx(dst, RWRET.cvt8()) : cg->movzx(dst, RWRET.cvt8());
      }
      break;
      case MemoryAccessSize::HalfWord:
      {
        is_signed ? cg->movsx(dst, RWRET.cvt16()) : cg->movzx(dst, RWRET.cvt16());
      }
      break;
      case MemoryAccessSize::Word:
      {
        if (dst != RWRET)
          cg->mov(dst, RWRET);
      }
      break;
    }
  }

  if (cycles_to_remove != 0)
    cg->sub(cg->dword[PTR(&g_state.pending_ticks)], cycles_to_remove);

  // restore regs
  if (stack_size > 0)
  {
    u32 stack_offset = STACK_SHADOW_SIZE;
    for (u32 i = 0; i < NUM_HOST_REGS; i++)
    {
      if ((gpr_bitmask & (1u << i)) && IsCallerSavedRegister(i) && (!is_load || data_register != i))
      {
        cg->mov(Reg64(i), cg->qword[cg->rsp + stack_offset]);
        stack_offset += GPR_SIZE;
      }
    }

    cg->add(cg->rsp, stack_size);
  }

  cg->jmp(static_cast<const u8*>(code_address) + code_size);

  // fill the rest of it with nops, if any
  DebugAssert(code_size >= BACKPATCH_JMP_SIZE);
  if (code_size > BACKPATCH_JMP_SIZE)
    std::memset(static_cast<u8*>(code_address) + BACKPATCH_JMP_SIZE, 0x90, code_size - BACKPATCH_JMP_SIZE);

  return static_cast<u32>(cg->getSize());
}

#endif // CPU_ARCH_X64