duckstation

DuckStation, archived at the revision just before upstream changed it to a proprietary software project; this version is the libre one.
git clone https://git.neptards.moe/u3shit/duckstation.git
Log | Files | Refs | README | LICENSE

cpu_pgxp.cpp (44342B)


      1 // SPDX-FileCopyrightText: 2016 iCatButler, 2019-2023 Connor McLaughlin <stenzek@gmail.com>
      2 // SPDX-License-Identifier: GPL-2.0+
      3 //
      4 // This file has been completely rewritten over the years compared to the original PCSXR-PGXP release.
      5 // No original code remains. The original copyright notice is included above for historical purposes.
      6 //
      7 
      8 #include "cpu_pgxp.h"
      9 #include "bus.h"
     10 #include "cpu_core.h"
     11 #include "cpu_disasm.h"
     12 #include "settings.h"
     13 
     14 #include "util/gpu_device.h"
     15 
     16 #include "common/assert.h"
     17 #include "common/log.h"
     18 
     19 #include <climits>
     20 #include <cmath>
     21 
     22 Log_SetChannel(CPU::PGXP);
     23 
     24 // #define LOG_VALUES 1
     25 // #define LOG_LOOKUPS 1
     26 
     27 // TODO: Don't update flags on Validate(), instead return it.
     28 
     29 namespace CPU::PGXP {
     30 
     31 enum : u32
     32 {
     33   VERTEX_CACHE_WIDTH = 2048,
     34   VERTEX_CACHE_HEIGHT = 2048,
     35   VERTEX_CACHE_SIZE = VERTEX_CACHE_WIDTH * VERTEX_CACHE_HEIGHT,
     36   PGXP_MEM_SIZE = (static_cast<u32>(Bus::RAM_8MB_SIZE) + static_cast<u32>(CPU::SCRATCHPAD_SIZE)) / 4,
     37   PGXP_MEM_SCRATCH_OFFSET = Bus::RAM_8MB_SIZE / 4,
     38 };
     39 
     40 enum : u32
     41 {
     42   VALID_X = (1u << 0),
     43   VALID_Y = (1u << 1),
     44   VALID_Z = (1u << 2),
     45   VALID_LOWZ = (1u << 16),      // Valid Z from the low part of a 32-bit value.
     46   VALID_HIGHZ = (1u << 17),     // Valid Z from the high part of a 32-bit value.
     47   VALID_TAINTED_Z = (1u << 31), // X/Y has been changed, Z may not be accurate.
     48 
     49   VALID_XY = (VALID_X | VALID_Y),
     50   VALID_XYZ = (VALID_X | VALID_Y | VALID_Z),
     51   VALID_ALL = (VALID_X | VALID_Y | VALID_Z),
     52 };
     53 
     54 #define LOWORD_U16(val) (static_cast<u16>(val))
     55 #define HIWORD_U16(val) (static_cast<u16>(static_cast<u32>(val) >> 16))
     56 #define LOWORD_S16(val) (static_cast<s16>(static_cast<u16>(val)))
     57 #define HIWORD_S16(val) (static_cast<s16>(static_cast<u16>(static_cast<u32>(val) >> 16)))
     58 #define SET_LOWORD(val, loword) ((static_cast<u32>(val) & 0xFFFF0000u) | static_cast<u32>(static_cast<u16>(loword)))
     59 #define SET_HIWORD(val, hiword) ((static_cast<u32>(val) & 0x0000FFFFu) | (static_cast<u32>(hiword) << 16))
     60 
     61 static double f16Sign(double val);
     62 static double f16Unsign(double val);
     63 static double f16Overflow(double val);
     64 
     65 static void CacheVertex(u32 value, const PGXPValue& vertex);
     66 static PGXPValue* GetCachedVertex(u32 value);
     67 
     68 static float TruncateVertexPosition(float p);
     69 static bool IsWithinTolerance(float precise_x, float precise_y, int int_x, int int_y);
     70 
     71 static PGXPValue& GetRdValue(Instruction instr);
     72 static PGXPValue& GetRtValue(Instruction instr);
     73 static PGXPValue& ValidateAndGetRtValue(Instruction instr, u32 rtVal);
     74 static PGXPValue& ValidateAndGetRsValue(Instruction instr, u32 rsVal);
     75 static void SetRtValue(Instruction instr, const PGXPValue& val);
     76 static void SetRtValue(Instruction instr, const PGXPValue& val, u32 rtVal);
     77 static PGXPValue& GetSXY0();
     78 static PGXPValue& GetSXY1();
     79 static PGXPValue& GetSXY2();
     80 static PGXPValue& PushSXY();
     81 
     82 static PGXPValue* GetPtr(u32 addr);
     83 static const PGXPValue& ValidateAndLoadMem(u32 addr, u32 value);
     84 static void ValidateAndLoadMem16(PGXPValue& dest, u32 addr, u32 value, bool sign);
     85 
     86 static void CPU_MTC2(u32 reg, const PGXPValue& value, u32 val);
     87 static void CPU_BITWISE(Instruction instr, u32 rdVal, u32 rsVal, u32 rtVal);
     88 static void CPU_SLL(Instruction instr, u32 rtVal, u32 sh);
     89 static void CPU_SRx(Instruction instr, u32 rtVal, u32 sh, bool sign, bool is_variable);
     90 
     91 static void WriteMem(u32 addr, const PGXPValue& value);
     92 static void WriteMem16(u32 addr, const PGXPValue& value);
     93 
     94 static void CopyZIfMissing(PGXPValue& dst, const PGXPValue& src);
     95 static void SelectZ(float& dst_z, u32& dst_flags, const PGXPValue& src1, const PGXPValue& src2);
     96 
     97 #ifdef LOG_VALUES
     98 static void LogInstruction(u32 pc, Instruction instr);
     99 static void LogValue(const char* name, u32 rval, const PGXPValue* val);
    100 static void LogValueStr(SmallStringBase& str, const char* name, u32 rval, const PGXPValue* val);
    101 
    102 // clang-format off
    103 #define LOG_VALUES_NV() do { LogInstruction(CPU::g_state.current_instruction_pc, instr); } while (0)
    104 #define LOG_VALUES_1(name, rval, val) do { LogInstruction(CPU::g_state.current_instruction_pc, instr); LogValue(name, rval, val); } while (0)
    105 #define LOG_VALUES_C1(rnum, rval) do { LogInstruction(CPU::g_state.current_instruction_pc,instr); LogValue(CPU::GetRegName(static_cast<CPU::Reg>(rnum)), rval, &g_state.pgxp_gpr[static_cast<u32>(rnum)]); } while(0)
    106 #define LOG_VALUES_C2(r1num, r1val, r2num, r2val) do { LogInstruction(CPU::g_state.current_instruction_pc,instr); LogValue(CPU::GetRegName(static_cast<CPU::Reg>(r1num)), r1val, &g_state.pgxp_gpr[static_cast<u32>(r1num)]); LogValue(CPU::GetRegName(static_cast<CPU::Reg>(r2num)), r2val, &g_state.pgxp_gpr[static_cast<u32>(r2num)]); } while(0)
    107 #define LOG_VALUES_LOAD(addr, val) do { LogInstruction(CPU::g_state.current_instruction_pc,instr); LogValue(TinyString::from_format("MEM[{:08X}]", addr).c_str(), val, GetPtr(addr)); } while(0)
    108 #define LOG_VALUES_STORE(rnum, rval, addr) do { LOG_VALUES_C1(rnum, rval); std::fprintf(s_log, " addr=%08X", addr); } while(0)
    109 #else
    110 #define LOG_VALUES_NV() (void)0
    111 #define LOG_VALUES_1(name, rval, val) (void)0
    112 #define LOG_VALUES_C1(rnum, rval) (void)0
    113 #define LOG_VALUES_C2(r1num, r1val, r2num, r2val) (void)0
    114 #define LOG_VALUES_LOAD(addr, val) (void)0
    115 #define LOG_VALUES_STORE(rnum, rval, addr) (void)0
    116 #endif
    117 // clang-format on
    118 
    119 static constexpr const PGXPValue INVALID_VALUE = {};
    120 
    121 static PGXPValue* s_mem = nullptr;
    122 static PGXPValue* s_vertex_cache = nullptr;
    123 
    124 #ifdef LOG_VALUES
    125 static std::FILE* s_log;
    126 #endif
    127 } // namespace CPU::PGXP
    128 
    129 void CPU::PGXP::Initialize()
    130 {
    131   std::memset(g_state.pgxp_gpr, 0, sizeof(g_state.pgxp_gpr));
    132   std::memset(g_state.pgxp_cop0, 0, sizeof(g_state.pgxp_cop0));
    133   std::memset(g_state.pgxp_gte, 0, sizeof(g_state.pgxp_gte));
    134 
    135   if (!s_mem)
    136   {
    137     s_mem = static_cast<PGXPValue*>(std::calloc(PGXP_MEM_SIZE, sizeof(PGXPValue)));
    138     if (!s_mem)
    139       Panic("Failed to allocate PGXP memory");
    140   }
    141 
    142   if (g_settings.gpu_pgxp_vertex_cache && !s_vertex_cache)
    143   {
    144     s_vertex_cache = static_cast<PGXPValue*>(std::calloc(VERTEX_CACHE_SIZE, sizeof(PGXPValue)));
    145     if (!s_vertex_cache)
    146     {
    147       ERROR_LOG("Failed to allocate memory for vertex cache, disabling.");
    148       g_settings.gpu_pgxp_vertex_cache = false;
    149     }
    150   }
    151 
    152   if (s_vertex_cache)
    153     std::memset(s_vertex_cache, 0, sizeof(PGXPValue) * VERTEX_CACHE_SIZE);
    154 }
    155 
    156 void CPU::PGXP::Reset()
    157 {
    158   std::memset(g_state.pgxp_gpr, 0, sizeof(g_state.pgxp_gpr));
    159   std::memset(g_state.pgxp_cop0, 0, sizeof(g_state.pgxp_cop0));
    160   std::memset(g_state.pgxp_gte, 0, sizeof(g_state.pgxp_gte));
    161 
    162   if (s_mem)
    163     std::memset(s_mem, 0, sizeof(PGXPValue) * PGXP_MEM_SIZE);
    164 
    165   if (g_settings.gpu_pgxp_vertex_cache && s_vertex_cache)
    166     std::memset(s_vertex_cache, 0, sizeof(PGXPValue) * VERTEX_CACHE_SIZE);
    167 }
    168 
    169 void CPU::PGXP::Shutdown()
    170 {
    171   if (s_vertex_cache)
    172   {
    173     std::free(s_vertex_cache);
    174     s_vertex_cache = nullptr;
    175   }
    176   if (s_mem)
    177   {
    178     std::free(s_mem);
    179     s_mem = nullptr;
    180   }
    181 
    182   std::memset(g_state.pgxp_gte, 0, sizeof(g_state.pgxp_gte));
    183   std::memset(g_state.pgxp_gpr, 0, sizeof(g_state.pgxp_gpr));
    184   std::memset(g_state.pgxp_cop0, 0, sizeof(g_state.pgxp_cop0));
    185 }
    186 
    187 ALWAYS_INLINE_RELEASE double CPU::PGXP::f16Sign(double val)
    188 {
    189   const s32 s = static_cast<s32>(static_cast<s64>(val * (USHRT_MAX + 1)));
    190   return static_cast<double>(s) / static_cast<double>(USHRT_MAX + 1);
    191 }
    192 
    193 ALWAYS_INLINE_RELEASE double CPU::PGXP::f16Unsign(double val)
    194 {
    195   return (val >= 0) ? val : (val + (USHRT_MAX + 1));
    196 }
    197 
    198 ALWAYS_INLINE_RELEASE double CPU::PGXP::f16Overflow(double val)
    199 {
    200   return static_cast<double>(static_cast<s64>(val) >> 16);
    201 }
    202 
    203 ALWAYS_INLINE CPU::PGXPValue& CPU::PGXP::GetRdValue(Instruction instr)
    204 {
    205   return g_state.pgxp_gpr[static_cast<u8>(instr.r.rd.GetValue())];
    206 }
    207 
    208 ALWAYS_INLINE CPU::PGXPValue& CPU::PGXP::GetRtValue(Instruction instr)
    209 {
    210   return g_state.pgxp_gpr[static_cast<u8>(instr.r.rt.GetValue())];
    211 }
    212 
    213 ALWAYS_INLINE CPU::PGXPValue& CPU::PGXP::ValidateAndGetRtValue(Instruction instr, u32 rtVal)
    214 {
    215   PGXPValue& ret = g_state.pgxp_gpr[static_cast<u8>(instr.r.rt.GetValue())];
    216   ret.Validate(rtVal);
    217   return ret;
    218 }
    219 
    220 ALWAYS_INLINE CPU::PGXPValue& CPU::PGXP::ValidateAndGetRsValue(Instruction instr, u32 rsVal)
    221 {
    222   PGXPValue& ret = g_state.pgxp_gpr[static_cast<u8>(instr.r.rs.GetValue())];
    223   ret.Validate(rsVal);
    224   return ret;
    225 }
    226 
    227 ALWAYS_INLINE void CPU::PGXP::SetRtValue(Instruction instr, const PGXPValue& val)
    228 {
    229   g_state.pgxp_gpr[static_cast<u8>(instr.r.rt.GetValue())] = val;
    230 }
    231 
    232 ALWAYS_INLINE void CPU::PGXP::SetRtValue(Instruction instr, const PGXPValue& val, u32 rtVal)
    233 {
    234   PGXPValue& prtVal = g_state.pgxp_gpr[static_cast<u8>(instr.r.rt.GetValue())];
    235   prtVal = val;
    236   prtVal.value = rtVal;
    237 }
    238 
    239 ALWAYS_INLINE CPU::PGXPValue& CPU::PGXP::GetSXY0()
    240 {
    241   return g_state.pgxp_gte[12];
    242 }
    243 
    244 ALWAYS_INLINE CPU::PGXPValue& CPU::PGXP::GetSXY1()
    245 {
    246   return g_state.pgxp_gte[13];
    247 }
    248 
    249 ALWAYS_INLINE CPU::PGXPValue& CPU::PGXP::GetSXY2()
    250 {
    251   return g_state.pgxp_gte[14];
    252 }
    253 
    254 ALWAYS_INLINE CPU::PGXPValue& CPU::PGXP::PushSXY()
    255 {
    256   g_state.pgxp_gte[12] = g_state.pgxp_gte[13];
    257   g_state.pgxp_gte[13] = g_state.pgxp_gte[14];
    258   return g_state.pgxp_gte[14];
    259 }
    260 
    261 ALWAYS_INLINE_RELEASE CPU::PGXPValue* CPU::PGXP::GetPtr(u32 addr)
    262 {
    263 #if 0
    264   if ((addr & CPU::PHYSICAL_MEMORY_ADDRESS_MASK) >= 0x0017A2B4 &&
    265       (addr & CPU::PHYSICAL_MEMORY_ADDRESS_MASK) <= 0x0017A2B4)
    266     __debugbreak();
    267 #endif
    268 
    269   if ((addr & SCRATCHPAD_ADDR_MASK) == SCRATCHPAD_ADDR)
    270     return &s_mem[PGXP_MEM_SCRATCH_OFFSET + ((addr & SCRATCHPAD_OFFSET_MASK) >> 2)];
    271 
    272   const u32 paddr = (addr & PHYSICAL_MEMORY_ADDRESS_MASK);
    273   if (paddr < Bus::RAM_MIRROR_END)
    274     return &s_mem[(paddr & Bus::g_ram_mask) >> 2];
    275   else
    276     return nullptr;
    277 }
    278 
    279 ALWAYS_INLINE_RELEASE const CPU::PGXPValue& CPU::PGXP::ValidateAndLoadMem(u32 addr, u32 value)
    280 {
    281   PGXPValue* pMem = GetPtr(addr);
    282   if (!pMem) [[unlikely]]
    283     return INVALID_VALUE;
    284 
    285   pMem->Validate(value);
    286   return *pMem;
    287 }
    288 
    289 ALWAYS_INLINE_RELEASE void CPU::PGXP::ValidateAndLoadMem16(PGXPValue& dest, u32 addr, u32 value, bool sign)
    290 {
    291   PGXPValue* pMem = GetPtr(addr);
    292   if (!pMem) [[unlikely]]
    293   {
    294     dest = INVALID_VALUE;
    295     return;
    296   }
    297 
    298   // determine if high or low word
    299   const bool hiword = ((addr & 2) != 0);
    300 
    301   // only validate the component we're interested in
    302   pMem->flags = hiword ?
    303                   ((Truncate16(pMem->value >> 16) == Truncate16(value)) ? pMem->flags : (pMem->flags & ~VALID_Y)) :
    304                   ((Truncate16(pMem->value) == Truncate16(value)) ? pMem->flags : (pMem->flags & ~VALID_X));
    305 
    306   // copy whole value
    307   dest = *pMem;
    308 
    309   // if high word then shift
    310   if (hiword)
    311   {
    312     dest.x = dest.y;
    313     dest.flags = (dest.flags & ~VALID_X) | ((dest.flags & VALID_Y) >> 1);
    314   }
    315 
    316   // only set y as valid if x is also valid.. don't want to make fake values
    317   if (dest.flags & VALID_X)
    318   {
    319     dest.y = (dest.x < 0) ? -1.0f * sign : 0.0f;
    320     dest.flags |= VALID_Y;
    321   }
    322   else
    323   {
    324     dest.y = 0.0f;
    325     dest.flags &= ~VALID_Y;
    326   }
    327 
    328   dest.value = value;
    329 }
    330 
    331 ALWAYS_INLINE_RELEASE void CPU::PGXP::WriteMem(u32 addr, const PGXPValue& value)
    332 {
    333   PGXPValue* pMem = GetPtr(addr);
    334   if (!pMem) [[unlikely]]
    335     return;
    336 
    337   *pMem = value;
    338   pMem->flags |= VALID_LOWZ | VALID_HIGHZ;
    339 }
    340 
    341 ALWAYS_INLINE_RELEASE void CPU::PGXP::WriteMem16(u32 addr, const PGXPValue& value)
    342 {
    343   PGXPValue* dest = GetPtr(addr);
    344   if (!dest) [[unlikely]]
    345     return;
    346 
    347   // determine if high or low word
    348   const bool hiword = ((addr & 2) != 0);
    349   if (hiword)
    350   {
    351     dest->y = value.x;
    352     dest->flags = (dest->flags & ~VALID_Y) | ((value.flags & VALID_X) << 1);
    353     dest->value = (dest->value & UINT32_C(0x0000FFFF)) | (value.value << 16);
    354   }
    355   else
    356   {
    357     dest->x = value.x;
    358     dest->flags = (dest->flags & ~VALID_X) | (value.flags & VALID_X);
    359     dest->value = (dest->value & UINT32_C(0xFFFF0000)) | (value.value & UINT32_C(0x0000FFFF));
    360   }
    361 
    362   // overwrite z/w if valid
    363   // TODO: Check modified
    364   if (value.flags & VALID_Z)
    365   {
    366     dest->z = value.z;
    367     dest->flags |= VALID_Z | (hiword ? VALID_HIGHZ : VALID_LOWZ);
    368   }
    369   else
    370   {
    371     dest->flags &= hiword ? ~VALID_HIGHZ : ~VALID_LOWZ;
    372     if (dest->flags & VALID_Z && !(dest->flags & (VALID_HIGHZ | VALID_LOWZ)))
    373       dest->flags &= ~VALID_Z;
    374   }
    375 }
    376 
    377 ALWAYS_INLINE_RELEASE void CPU::PGXP::CopyZIfMissing(PGXPValue& dst, const PGXPValue& src)
    378 {
    379   dst.z = (dst.flags & VALID_Z) ? dst.z : src.z;
    380   dst.flags |= (src.flags & VALID_Z);
    381 }
    382 
    383 ALWAYS_INLINE_RELEASE void CPU::PGXP::SelectZ(float& dst_z, u32& dst_flags, const PGXPValue& src1,
    384                                               const PGXPValue& src2)
    385 {
    386   // Prefer src2 if src1 is missing Z, or is potentially an imprecise value, when src2 is precise.
    387   dst_z = (!(src1.flags & VALID_Z) ||
    388            (src1.flags & VALID_TAINTED_Z && (src2.flags & (VALID_Z | VALID_TAINTED_Z)) == VALID_Z)) ?
    389             src2.z :
    390             src1.z;
    391   dst_flags |= ((src1.flags | src2.flags) & VALID_Z);
    392 }
    393 
    394 #ifdef LOG_VALUES
    395 void CPU::PGXP::LogInstruction(u32 pc, Instruction instr)
    396 {
    397   if (!s_log) [[unlikely]]
    398   {
    399     s_log = std::fopen("pgxp.log", "wb");
    400   }
    401   else
    402   {
    403     std::fflush(s_log);
    404     std::fputc('\n', s_log);
    405   }
    406 
    407   SmallString str;
    408   DisassembleInstruction(&str, pc, instr.bits);
    409   std::fprintf(s_log, "%08X %08X %-20s", pc, instr.bits, str.c_str());
    410 }
    411 
    412 void CPU::PGXP::LogValue(const char* name, u32 rval, const PGXPValue* val)
    413 {
    414   if (!s_log) [[unlikely]]
    415     return;
    416 
    417   SmallString str;
    418   LogValueStr(str, name, rval, val);
    419   std::fprintf(s_log, " %s", str.c_str());
    420 }
    421 
    422 void CPU::PGXP::LogValueStr(SmallStringBase& str, const char* name, u32 rval, const PGXPValue* val)
    423 {
    424   str.append_format("{}=[{:08X}", name, rval);
    425   if (!val)
    426   {
    427     str.append(", NULL]");
    428   }
    429   else
    430   {
    431     if (val->value != rval)
    432       str.append_format(", PGXP{:08X}", val->value);
    433 
    434     str.append_format(", {{{},{},{}}}", val->x, val->y, val->z);
    435 
    436     if (val->flags & VALID_ALL)
    437     {
    438       str.append(", valid=");
    439       if (val->flags & VALID_X)
    440         str.append('X');
    441       if (val->flags & VALID_Y)
    442         str.append('Y');
    443       if (val->flags & VALID_Z)
    444         str.append('Z');
    445     }
    446 
    447     // if (val->flags & VALID_TAINTED_Z)
    448     // str.append(", tainted");
    449 
    450     str.append(']');
    451   }
    452 }
    453 
    454 #endif
    455 
    456 void CPU::PGXP::GTE_RTPS(float x, float y, float z, u32 value)
    457 {
    458   PGXPValue& pvalue = PushSXY();
    459   pvalue.x = x;
    460   pvalue.y = y;
    461   pvalue.z = z;
    462   pvalue.value = value;
    463   pvalue.flags = VALID_ALL;
    464 
    465   if (g_settings.gpu_pgxp_vertex_cache)
    466     CacheVertex(value, pvalue);
    467 }
    468 
    469 bool CPU::PGXP::GTE_HasPreciseVertices(u32 sxy0, u32 sxy1, u32 sxy2)
    470 {
    471   PGXPValue& SXY0 = GetSXY0();
    472   SXY0.Validate(sxy0);
    473   PGXPValue& SXY1 = GetSXY1();
    474   SXY1.Validate(sxy1);
    475   PGXPValue& SXY2 = GetSXY2();
    476   SXY2.Validate(sxy2);
    477 
    478   // Don't use accurate clipping for game-constructed values, which don't have a valid Z.
    479   return (((SXY0.flags & SXY1.flags & SXY2.flags & VALID_XYZ) == VALID_XYZ));
    480 }
    481 
    482 float CPU::PGXP::GTE_NCLIP()
    483 {
    484   const PGXPValue& SXY0 = GetSXY0();
    485   const PGXPValue& SXY1 = GetSXY1();
    486   const PGXPValue& SXY2 = GetSXY2();
    487   float nclip = ((SXY0.x * SXY1.y) + (SXY1.x * SXY2.y) + (SXY2.x * SXY0.y) - (SXY0.x * SXY2.y) - (SXY1.x * SXY0.y) -
    488                  (SXY2.x * SXY1.y));
    489 
    490   // ensure fractional values are not incorrectly rounded to 0
    491   const float nclip_abs = std::abs(nclip);
    492   if (0.1f < nclip_abs && nclip_abs < 1.0f)
    493     nclip += (nclip < 0.0f ? -1.0f : 1.0f);
    494 
    495   return nclip;
    496 }
    497 
    498 ALWAYS_INLINE_RELEASE void CPU::PGXP::CPU_MTC2(u32 reg, const PGXPValue& value, u32 val)
    499 {
    500   switch (reg)
    501   {
    502     case 15:
    503     {
    504       // push FIFO
    505       PGXPValue& SXY2 = PushSXY();
    506       SXY2 = value;
    507       return;
    508     }
    509 
    510     // read-only registers
    511     case 29:
    512     case 31:
    513     {
    514       return;
    515     }
    516 
    517     default:
    518     {
    519       PGXPValue& gteVal = g_state.pgxp_gte[reg];
    520       gteVal = value;
    521       gteVal.value = val;
    522       return;
    523     }
    524   }
    525 }
    526 
    527 void CPU::PGXP::CPU_MFC2(Instruction instr, u32 rdVal)
    528 {
    529   // CPU[Rt] = GTE_D[Rd]
    530   const u32 idx = instr.cop.Cop2Index();
    531   LOG_VALUES_1(CPU::GetGTERegisterName(idx), rdVal, &g_state.pgxp_gte[idx]);
    532 
    533   PGXPValue& prdVal = g_state.pgxp_gte[idx];
    534   prdVal.Validate(rdVal);
    535   SetRtValue(instr, prdVal, rdVal);
    536 }
    537 
    538 void CPU::PGXP::CPU_MTC2(Instruction instr, u32 rtVal)
    539 {
    540   // GTE_D[Rd] = CPU[Rt]
    541   const u32 idx = instr.cop.Cop2Index();
    542   LOG_VALUES_C1(instr.r.rt.GetValue(), rtVal);
    543 
    544   PGXPValue& prtVal = ValidateAndGetRtValue(instr, rtVal);
    545   CPU_MTC2(idx, prtVal, rtVal);
    546 }
    547 
    548 void CPU::PGXP::CPU_LWC2(Instruction instr, u32 addr, u32 rtVal)
    549 {
    550   // GTE_D[Rt] = Mem[addr]
    551   LOG_VALUES_LOAD(addr, rtVal);
    552 
    553   const PGXPValue& pMem = ValidateAndLoadMem(addr, rtVal);
    554   CPU_MTC2(static_cast<u32>(instr.r.rt.GetValue()), pMem, rtVal);
    555 }
    556 
    557 void CPU::PGXP::CPU_SWC2(Instruction instr, u32 addr, u32 rtVal)
    558 {
    559   //  Mem[addr] = GTE_D[Rt]
    560   const u32 idx = static_cast<u32>(instr.r.rt.GetValue());
    561   PGXPValue& prtVal = g_state.pgxp_gte[idx];
    562 #ifdef LOG_VALUES
    563   LOG_VALUES_1(CPU::GetGTERegisterName(idx), rtVal, &prtVal);
    564   std::fprintf(s_log, " addr=%08X", addr);
    565 #endif
    566   prtVal.Validate(rtVal);
    567   WriteMem(addr, prtVal);
    568 }
    569 
    570 ALWAYS_INLINE_RELEASE void CPU::PGXP::CacheVertex(u32 value, const PGXPValue& vertex)
    571 {
    572   const s16 sx = static_cast<s16>(value & 0xFFFFu);
    573   const s16 sy = static_cast<s16>(value >> 16);
    574   DebugAssert(sx >= -1024 && sx <= 1023 && sy >= -1024 && sy <= 1023);
    575   s_vertex_cache[(sy + 1024) * VERTEX_CACHE_WIDTH + (sx + 1024)] = vertex;
    576 }
    577 
    578 ALWAYS_INLINE_RELEASE CPU::PGXPValue* CPU::PGXP::GetCachedVertex(u32 value)
    579 {
    580   const s16 sx = static_cast<s16>(value & 0xFFFFu);
    581   const s16 sy = static_cast<s16>(value >> 16);
    582   return (sx >= -1024 && sx <= 1023 && sy >= -1024 && sy <= 1013) ?
    583            &s_vertex_cache[(sy + 1024) * VERTEX_CACHE_WIDTH + (sx + 1024)] :
    584            nullptr;
    585 }
    586 
    587 ALWAYS_INLINE_RELEASE float CPU::PGXP::TruncateVertexPosition(float p)
    588 {
    589   const s32 int_part = static_cast<s32>(p);
    590   const float int_part_f = static_cast<float>(int_part);
    591   return static_cast<float>(static_cast<s16>(int_part << 5) >> 5) + (p - int_part_f);
    592 }
    593 
    594 ALWAYS_INLINE_RELEASE bool CPU::PGXP::IsWithinTolerance(float precise_x, float precise_y, int int_x, int int_y)
    595 {
    596   const float tolerance = g_settings.gpu_pgxp_tolerance;
    597   if (tolerance < 0.0f)
    598     return true;
    599 
    600   return (std::abs(precise_x - static_cast<float>(int_x)) <= tolerance &&
    601           std::abs(precise_y - static_cast<float>(int_y)) <= tolerance);
    602 }
    603 
    604 bool CPU::PGXP::GetPreciseVertex(u32 addr, u32 value, int x, int y, int xOffs, int yOffs, float* out_x, float* out_y,
    605                                  float* out_w)
    606 {
    607   const PGXPValue* vert = GetPtr(addr);
    608   if (vert && ((vert->flags & VALID_XY) == VALID_XY) && (vert->value == value))
    609   {
    610     // There is a value here with valid X and Y coordinates
    611     *out_x = TruncateVertexPosition(vert->x) + static_cast<float>(xOffs);
    612     *out_y = TruncateVertexPosition(vert->y) + static_cast<float>(yOffs);
    613     *out_w = vert->z / 32768.0f;
    614 
    615 #ifdef LOG_LOOKUPS
    616     GL_INS_FMT("0x{:08X} {},{} => {},{} ({},{},{}) ({},{})", addr, x, y, *out_x, *out_y,
    617                TruncateVertexPosition(vert->x), TruncateVertexPosition(vert->y), vert->z, std::abs(*out_x - x),
    618                std::abs(*out_y - y));
    619 #endif
    620 
    621     if (IsWithinTolerance(*out_x, *out_y, x, y))
    622     {
    623       // check validity of z component
    624       return ((vert->flags & VALID_Z) == VALID_Z);
    625     }
    626   }
    627 
    628   if (g_settings.gpu_pgxp_vertex_cache)
    629   {
    630     vert = GetCachedVertex(value);
    631     if (vert && (vert->flags & VALID_XY) == VALID_XY)
    632     {
    633       *out_x = TruncateVertexPosition(vert->x) + static_cast<float>(xOffs);
    634       *out_y = TruncateVertexPosition(vert->y) + static_cast<float>(yOffs);
    635       *out_w = vert->z / 32768.0f;
    636 
    637       if (IsWithinTolerance(*out_x, *out_y, x, y))
    638         return false;
    639     }
    640   }
    641 
    642   // no valid value can be found anywhere, use the native PSX data
    643   *out_x = static_cast<float>(x);
    644   *out_y = static_cast<float>(y);
    645   *out_w = 1.0f;
    646   return false;
    647 }
    648 
    649 void CPU::PGXP::CPU_LW(Instruction instr, u32 addr, u32 rtVal)
    650 {
    651   // Rt = Mem[Rs + Im]
    652   LOG_VALUES_LOAD(addr, rtVal);
    653   SetRtValue(instr, ValidateAndLoadMem(addr, rtVal));
    654 }
    655 
    656 void CPU::PGXP::CPU_LBx(Instruction instr, u32 addr, u32 rtVal)
    657 {
    658   LOG_VALUES_LOAD(addr, rtVal);
    659   SetRtValue(instr, INVALID_VALUE);
    660 }
    661 
    662 void CPU::PGXP::CPU_LH(Instruction instr, u32 addr, u32 rtVal)
    663 {
    664   // Rt = Mem[Rs + Im] (sign extended)
    665   LOG_VALUES_LOAD(addr, rtVal);
    666   ValidateAndLoadMem16(GetRtValue(instr), addr, rtVal, true);
    667 }
    668 
    669 void CPU::PGXP::CPU_LHU(Instruction instr, u32 addr, u32 rtVal)
    670 {
    671   // Rt = Mem[Rs + Im] (zero extended)
    672   LOG_VALUES_LOAD(addr, rtVal);
    673   ValidateAndLoadMem16(GetRtValue(instr), addr, rtVal, false);
    674 }
    675 
    676 void CPU::PGXP::CPU_SB(Instruction instr, u32 addr, u32 rtVal)
    677 {
    678   LOG_VALUES_STORE(instr.r.rt.GetValue(), rtVal, addr);
    679   WriteMem(addr, INVALID_VALUE);
    680 }
    681 
    682 void CPU::PGXP::CPU_SH(Instruction instr, u32 addr, u32 rtVal)
    683 {
    684   LOG_VALUES_STORE(instr.r.rt.GetValue(), rtVal, addr);
    685   PGXPValue& prtVal = ValidateAndGetRtValue(instr, rtVal);
    686   WriteMem16(addr, prtVal);
    687 }
    688 
    689 void CPU::PGXP::CPU_SW(Instruction instr, u32 addr, u32 rtVal)
    690 {
    691   // Mem[Rs + Im] = Rt
    692   LOG_VALUES_STORE(instr.r.rt.GetValue(), rtVal, addr);
    693   PGXPValue& prtVal = ValidateAndGetRtValue(instr, rtVal);
    694   WriteMem(addr, prtVal);
    695 }
    696 
    697 void CPU::PGXP::CPU_MOVE_Packed(u32 rd_and_rs, u32 rsVal)
    698 {
    699   const u32 Rs = (rd_and_rs & 0xFFu);
    700   const u32 Rd = (rd_and_rs >> 8);
    701   CPU_MOVE(Rd, Rs, rsVal);
    702 }
    703 
    704 void CPU::PGXP::CPU_MOVE(u32 Rd, u32 Rs, u32 rsVal)
    705 {
    706 #ifdef LOG_VALUES
    707   const Instruction instr = {0};
    708   LOG_VALUES_C1(Rs, rsVal);
    709 #endif
    710   PGXPValue& prsVal = g_state.pgxp_gpr[Rs];
    711   prsVal.Validate(rsVal);
    712   g_state.pgxp_gpr[Rd] = prsVal;
    713 }
    714 
    715 void CPU::PGXP::CPU_ADDI(Instruction instr, u32 rsVal)
    716 {
    717   LOG_VALUES_C1(instr.i.rs.GetValue(), rsVal);
    718 
    719   // Rt = Rs + Imm (signed)
    720   PGXPValue& prsVal = ValidateAndGetRsValue(instr, rsVal);
    721 
    722   const u32 immVal = instr.i.imm_sext32();
    723 
    724   PGXPValue& prtVal = GetRtValue(instr);
    725   prtVal = prsVal;
    726 
    727   if (immVal == 0)
    728     return;
    729 
    730   if (rsVal == 0)
    731   {
    732     // x is low precision value
    733     prtVal.x = static_cast<float>(LOWORD_S16(immVal));
    734     prtVal.y = static_cast<float>(HIWORD_S16(immVal));
    735     prtVal.flags |= VALID_X | VALID_Y | VALID_TAINTED_Z;
    736     prtVal.value = immVal;
    737     return;
    738   }
    739 
    740   prtVal.x = static_cast<float>(f16Unsign(prtVal.x));
    741   prtVal.x += static_cast<float>(LOWORD_U16(immVal));
    742 
    743   // carry on over/underflow
    744   const float of = (prtVal.x > USHRT_MAX) ? 1.0f : (prtVal.x < 0.0f) ? -1.0f : 0.0f;
    745   prtVal.x = static_cast<float>(f16Sign(prtVal.x));
    746   prtVal.y += HIWORD_S16(immVal) + of;
    747 
    748   // truncate on overflow/underflow
    749   prtVal.y += (prtVal.y > SHRT_MAX) ? -(USHRT_MAX + 1) : (prtVal.y < SHRT_MIN) ? (USHRT_MAX + 1) : 0.0f;
    750 
    751   prtVal.value = rsVal + immVal;
    752 
    753   prtVal.flags |= VALID_TAINTED_Z;
    754 }
    755 
    756 void CPU::PGXP::CPU_ANDI(Instruction instr, u32 rsVal)
    757 {
    758   LOG_VALUES_C1(instr.i.rs.GetValue(), rsVal);
    759 
    760   // Rt = Rs & Imm
    761   const u32 imm = instr.i.imm_zext32();
    762   const u32 rtVal = rsVal & imm;
    763   PGXPValue& prsVal = ValidateAndGetRsValue(instr, rsVal);
    764   PGXPValue& prtVal = GetRtValue(instr);
    765 
    766   // remove upper 16-bits
    767   prtVal.y = 0.0f;
    768   prtVal.z = prsVal.z;
    769   prtVal.value = rtVal;
    770   prtVal.flags = prsVal.flags | VALID_Y | VALID_TAINTED_Z;
    771 
    772   switch (imm)
    773   {
    774     case 0:
    775     {
    776       // if 0 then x == 0
    777       prtVal.x = 0.0f;
    778       prtVal.flags |= VALID_X;
    779     }
    780     break;
    781 
    782     case 0xFFFFu:
    783     {
    784       // if saturated then x == x
    785       prtVal.x = prsVal.x;
    786     }
    787     break;
    788 
    789     default:
    790     {
    791       // otherwise x is low precision value
    792       prtVal.x = static_cast<float>(LOWORD_S16(rtVal));
    793       prtVal.flags |= VALID_X;
    794     }
    795     break;
    796   }
    797 }
    798 
    799 void CPU::PGXP::CPU_ORI(Instruction instr, u32 rsVal)
    800 {
    801   LOG_VALUES_C1(instr.i.rs.GetValue(), rsVal);
    802 
    803   // Rt = Rs | Imm
    804   const u32 imm = instr.i.imm_zext32();
    805   const u32 rtVal = rsVal | imm;
    806 
    807   PGXPValue& pRsVal = ValidateAndGetRsValue(instr, rsVal);
    808   PGXPValue& pRtVal = GetRtValue(instr);
    809   pRtVal = pRsVal;
    810   pRtVal.value = rtVal;
    811 
    812   if (imm == 0) [[unlikely]]
    813   {
    814     // if 0 then x == x
    815   }
    816   else
    817   {
    818     // otherwise x is low precision value
    819     pRtVal.x = static_cast<float>(LOWORD_S16(rtVal));
    820     pRtVal.flags |= VALID_X | VALID_TAINTED_Z;
    821   }
    822 }
    823 
    824 void CPU::PGXP::CPU_XORI(Instruction instr, u32 rsVal)
    825 {
    826   LOG_VALUES_C1(instr.i.rs.GetValue(), rsVal);
    827 
    828   // Rt = Rs ^ Imm
    829   const u32 imm = instr.i.imm_zext32();
    830   const u32 rtVal = rsVal ^ imm;
    831 
    832   PGXPValue& pRsVal = ValidateAndGetRsValue(instr, rsVal);
    833   PGXPValue& pRtVal = GetRtValue(instr);
    834   pRtVal = pRsVal;
    835   pRtVal.value = rtVal;
    836 
    837   if (imm == 0) [[unlikely]]
    838   {
    839     // if 0 then x == x
    840   }
    841   else
    842   {
    843     // otherwise x is low precision value
    844     pRtVal.x = static_cast<float>(LOWORD_S16(rtVal));
    845     pRtVal.flags |= VALID_X | VALID_TAINTED_Z;
    846   }
    847 }
    848 
    849 void CPU::PGXP::CPU_SLTI(Instruction instr, u32 rsVal)
    850 {
    851   LOG_VALUES_C1(instr.i.rs.GetValue(), rsVal);
    852 
    853   // Rt = Rs < Imm (signed)
    854   const s32 imm = instr.i.imm_s16();
    855   PGXPValue& prsVal = ValidateAndGetRsValue(instr, rsVal);
    856 
    857   const float fimmx = static_cast<float>(imm);
    858   const float fimmy = fimmx < 0.0f ? -1.0f : 0.0f;
    859 
    860   PGXPValue& prtVal = GetRtValue(instr);
    861   prtVal.x = (prsVal.GetValidY(rsVal) < fimmy || prsVal.GetValidX(rsVal) < fimmx) ? 1.0f : 0.0f;
    862   prtVal.y = 0.0f;
    863   prtVal.z = prsVal.z;
    864   prtVal.flags = prsVal.flags | VALID_X | VALID_Y | VALID_TAINTED_Z;
    865   prtVal.value = BoolToUInt32(static_cast<s32>(rsVal) < imm);
    866 }
    867 
    868 void CPU::PGXP::CPU_SLTIU(Instruction instr, u32 rsVal)
    869 {
    870   LOG_VALUES_C1(instr.i.rs.GetValue(), rsVal);
    871 
    872   // Rt = Rs < Imm (Unsigned)
    873   const u32 imm = instr.i.imm_u16();
    874   PGXPValue& prsVal = ValidateAndGetRsValue(instr, rsVal);
    875 
    876   const float fimmx = static_cast<float>(static_cast<s16>(imm)); // deliberately signed
    877   const float fimmy = fimmx < 0.0f ? -1.0f : 0.0f;
    878 
    879   PGXPValue& prtVal = GetRtValue(instr);
    880   prtVal.x =
    881     (f16Unsign(prsVal.GetValidY(rsVal)) < f16Unsign(fimmy) || f16Unsign(prsVal.GetValidX(rsVal)) < fimmx) ? 1.0f : 0.0f;
    882   prtVal.y = 0.0f;
    883   prtVal.z = prsVal.z;
    884   prtVal.flags = prsVal.flags | VALID_X | VALID_Y | VALID_TAINTED_Z;
    885   prtVal.value = BoolToUInt32(rsVal < imm);
    886 }
    887 
    888 void CPU::PGXP::CPU_LUI(Instruction instr)
    889 {
    890   LOG_VALUES_NV();
    891 
    892   // Rt = Imm << 16
    893   PGXPValue& pRtVal = GetRtValue(instr);
    894   pRtVal.x = 0.0f;
    895   pRtVal.y = static_cast<float>(instr.i.imm_s16());
    896   pRtVal.z = 0.0f;
    897   pRtVal.value = instr.i.imm_zext32() << 16;
    898   pRtVal.flags = VALID_XY;
    899 }
    900 
    901 void CPU::PGXP::CPU_ADD(Instruction instr, u32 rsVal, u32 rtVal)
    902 {
    903   LOG_VALUES_C2(instr.r.rs.GetValue(), rsVal, instr.r.rt.GetValue(), rtVal);
    904 
    905   // Rd = Rs + Rt (signed)
    906   PGXPValue& prsVal = ValidateAndGetRsValue(instr, rsVal);
    907   PGXPValue& prtVal = ValidateAndGetRtValue(instr, rtVal);
    908   PGXPValue& prdVal = GetRdValue(instr);
    909 
    910   if (rtVal == 0)
    911   {
    912     prdVal = prsVal;
    913     CopyZIfMissing(prdVal, prtVal);
    914   }
    915   else if (rsVal == 0)
    916   {
    917     prdVal = prtVal;
    918     CopyZIfMissing(prdVal, prsVal);
    919   }
    920   else
    921   {
    922     const double x = f16Unsign(prsVal.GetValidX(rsVal)) + f16Unsign(prtVal.GetValidX(rtVal));
    923 
    924     // carry on over/underflow
    925     const float of = (x > USHRT_MAX) ? 1.0f : (x < 0.0f) ? -1.0f : 0.0f;
    926     prdVal.x = static_cast<float>(f16Sign(x));
    927     prdVal.y = prsVal.GetValidY(rsVal) + prtVal.GetValidY(rtVal) + of;
    928 
    929     // truncate on overflow/underflow
    930     prdVal.y += (prdVal.y > SHRT_MAX) ? -(USHRT_MAX + 1) : (prdVal.y < SHRT_MIN) ? (USHRT_MAX + 1) : 0.0f;
    931 
    932     prdVal.value = rsVal + rtVal;
    933 
    934     // valid x/y only if one side had a valid x/y
    935     prdVal.flags = prsVal.flags | (prtVal.flags & VALID_XY) | VALID_TAINTED_Z;
    936 
    937     SelectZ(prdVal.z, prdVal.flags, prsVal, prtVal);
    938   }
    939 }
    940 
    941 void CPU::PGXP::CPU_SUB(Instruction instr, u32 rsVal, u32 rtVal)
    942 {
    943   LOG_VALUES_C2(instr.r.rs.GetValue(), rsVal, instr.r.rt.GetValue(), rtVal);
    944 
    945   // Rd = Rs - Rt (signed)
    946   PGXPValue& prsVal = ValidateAndGetRsValue(instr, rsVal);
    947   PGXPValue& prtVal = ValidateAndGetRtValue(instr, rtVal);
    948   PGXPValue& prdVal = GetRdValue(instr);
    949 
    950   if (rtVal == 0)
    951   {
    952     prdVal = prsVal;
    953     CopyZIfMissing(prdVal, prtVal);
    954   }
    955   else
    956   {
    957     const double x = f16Unsign(prsVal.GetValidX(rsVal)) - f16Unsign(prtVal.GetValidX(rtVal));
    958 
    959     // carry on over/underflow
    960     const float of = (x > USHRT_MAX) ? 1.0f : (x < 0.0f) ? -1.0f : 0.0f;
    961     prdVal.x = static_cast<float>(f16Sign(x));
    962     prdVal.y = prsVal.GetValidY(rsVal) - (prtVal.GetValidY(rtVal) - of);
    963 
    964     // truncate on overflow/underflow
    965     prdVal.y += (prdVal.y > SHRT_MAX) ? -(USHRT_MAX + 1) : (prdVal.y < SHRT_MIN) ? (USHRT_MAX + 1) : 0.0f;
    966 
    967     prdVal.value = rsVal - rtVal;
    968 
    969     // valid x/y only if one side had a valid x/y
    970     prdVal.flags = prsVal.flags | (prtVal.flags & VALID_XY) | VALID_TAINTED_Z;
    971 
    972     SelectZ(prdVal.z, prdVal.flags, prsVal, prtVal);
    973   }
    974 }
    975 
    976 ALWAYS_INLINE_RELEASE void CPU::PGXP::CPU_BITWISE(Instruction instr, u32 rdVal, u32 rsVal, u32 rtVal)
    977 {
    978   // Rd = Rs & Rt
    979   PGXPValue& prsVal = ValidateAndGetRsValue(instr, rsVal);
    980   PGXPValue& prtVal = ValidateAndGetRtValue(instr, rtVal);
    981 
    982   float x, y;
    983   if (LOWORD_U16(rdVal) == 0)
    984     x = 0.0f;
    985   else if (LOWORD_U16(rdVal) == LOWORD_U16(rsVal))
    986     x = prsVal.GetValidX(rsVal);
    987   else if (LOWORD_U16(rdVal) == LOWORD_U16(rtVal))
    988     x = prtVal.GetValidX(rtVal);
    989   else
    990     x = static_cast<float>(LOWORD_S16(rdVal));
    991 
    992   if (HIWORD_U16(rdVal) == 0)
    993     y = 0.0f;
    994   else if (HIWORD_U16(rdVal) == HIWORD_U16(rsVal))
    995     y = prsVal.GetValidY(rsVal);
    996   else if (HIWORD_U16(rdVal) == HIWORD_U16(rtVal))
    997     y = prtVal.GetValidY(rtVal);
    998   else
    999     y = static_cast<float>(HIWORD_S16(rdVal));
   1000 
   1001   // Why not write directly to prdVal? Because it might be the same as the source.
   1002   u32 flags = ((prsVal.flags | prtVal.flags) & VALID_XY) ? (VALID_XY | VALID_TAINTED_Z) : 0;
   1003   PGXPValue& prdVal = GetRdValue(instr);
   1004   SelectZ(prdVal.z, flags, prsVal, prtVal);
   1005   prdVal.x = x;
   1006   prdVal.y = y;
   1007   prdVal.flags = flags;
   1008   prdVal.value = rdVal;
   1009 }
   1010 
   1011 void CPU::PGXP::CPU_AND_(Instruction instr, u32 rsVal, u32 rtVal)
   1012 {
   1013   LOG_VALUES_C2(instr.r.rs.GetValue(), rsVal, instr.r.rt.GetValue(), rtVal);
   1014 
   1015   // Rd = Rs & Rt
   1016   const u32 rdVal = rsVal & rtVal;
   1017   CPU_BITWISE(instr, rdVal, rsVal, rtVal);
   1018 }
   1019 
   1020 void CPU::PGXP::CPU_OR_(Instruction instr, u32 rsVal, u32 rtVal)
   1021 {
   1022   LOG_VALUES_C2(instr.r.rs.GetValue(), rsVal, instr.r.rt.GetValue(), rtVal);
   1023 
   1024   // Rd = Rs | Rt
   1025   const u32 rdVal = rsVal | rtVal;
   1026   CPU_BITWISE(instr, rdVal, rsVal, rtVal);
   1027 }
   1028 
   1029 void CPU::PGXP::CPU_XOR_(Instruction instr, u32 rsVal, u32 rtVal)
   1030 {
   1031   LOG_VALUES_C2(instr.r.rs.GetValue(), rsVal, instr.r.rt.GetValue(), rtVal);
   1032 
   1033   // Rd = Rs ^ Rt
   1034   const u32 rdVal = rsVal ^ rtVal;
   1035   CPU_BITWISE(instr, rdVal, rsVal, rtVal);
   1036 }
   1037 
   1038 void CPU::PGXP::CPU_NOR(Instruction instr, u32 rsVal, u32 rtVal)
   1039 {
   1040   LOG_VALUES_C2(instr.r.rs.GetValue(), rsVal, instr.r.rt.GetValue(), rtVal);
   1041 
   1042   // Rd = Rs NOR Rt
   1043   const u32 rdVal = ~(rsVal | rtVal);
   1044   CPU_BITWISE(instr, rdVal, rsVal, rtVal);
   1045 }
   1046 
   1047 void CPU::PGXP::CPU_SLT(Instruction instr, u32 rsVal, u32 rtVal)
   1048 {
   1049   LOG_VALUES_C2(instr.r.rs.GetValue(), rsVal, instr.r.rt.GetValue(), rtVal);
   1050 
   1051   // Rd = Rs < Rt (signed)
   1052   PGXPValue& prsVal = ValidateAndGetRsValue(instr, rsVal);
   1053   PGXPValue& prtVal = ValidateAndGetRtValue(instr, rtVal);
   1054   PGXPValue& prdVal = GetRdValue(instr);
   1055   prdVal.x = (prsVal.GetValidY(rsVal) < prtVal.GetValidY(rtVal) ||
   1056               f16Unsign(prsVal.GetValidX(rsVal)) < f16Unsign(prtVal.GetValidX(rtVal))) ?
   1057                1.0f :
   1058                0.0f;
   1059   prdVal.y = 0.0f;
   1060   prdVal.z = prsVal.z;
   1061   prdVal.flags = prsVal.flags | VALID_TAINTED_Z | VALID_X | VALID_Y;
   1062   prdVal.value = BoolToUInt32(static_cast<s32>(rsVal) < static_cast<s32>(rtVal));
   1063 }
   1064 
   1065 void CPU::PGXP::CPU_SLTU(Instruction instr, u32 rsVal, u32 rtVal)
   1066 {
   1067   LOG_VALUES_C2(instr.r.rs.GetValue(), rsVal, instr.r.rt.GetValue(), rtVal);
   1068 
   1069   // Rd = Rs < Rt (unsigned)
   1070   PGXPValue& prsVal = ValidateAndGetRsValue(instr, rsVal);
   1071   PGXPValue& prtVal = ValidateAndGetRtValue(instr, rtVal);
   1072   PGXPValue& prdVal = GetRdValue(instr);
   1073   prdVal.x = (f16Unsign(prsVal.GetValidY(rsVal)) < f16Unsign(prtVal.GetValidY(rtVal)) ||
   1074               f16Unsign(prsVal.GetValidX(rsVal)) < f16Unsign(prtVal.GetValidX(rtVal))) ?
   1075                1.0f :
   1076                0.0f;
   1077   prdVal.y = 0.0f;
   1078   prdVal.z = prsVal.z;
   1079   prdVal.flags = prsVal.flags | VALID_TAINTED_Z | VALID_X | VALID_Y;
   1080   prdVal.value = BoolToUInt32(rsVal < rtVal);
   1081 }
   1082 
   1083 void CPU::PGXP::CPU_MULT(Instruction instr, u32 rsVal, u32 rtVal)
   1084 {
   1085   LOG_VALUES_C2(instr.r.rs.GetValue(), rsVal, instr.r.rt.GetValue(), rtVal);
   1086 
   1087   // Hi/Lo = Rs * Rt (signed)
   1088   PGXPValue& prsVal = ValidateAndGetRsValue(instr, rsVal);
   1089   PGXPValue& prtVal = ValidateAndGetRtValue(instr, rtVal);
   1090 
   1091   PGXPValue& ploVal = g_state.pgxp_gpr[static_cast<u8>(Reg::lo)];
   1092   PGXPValue& phiVal = g_state.pgxp_gpr[static_cast<u8>(Reg::hi)];
   1093   ploVal = prsVal;
   1094   CopyZIfMissing(ploVal, prsVal);
   1095 
   1096   // Z/valid is the same
   1097   phiVal = ploVal;
   1098 
   1099   const float rsx = prsVal.GetValidX(rsVal);
   1100   const float rsy = prsVal.GetValidY(rsVal);
   1101   const float rtx = prtVal.GetValidX(rtVal);
   1102   const float rty = prtVal.GetValidY(rtVal);
   1103 
   1104   // Multiply out components
   1105   const double xx = f16Unsign(rsx) * f16Unsign(rtx);
   1106   const double xy = f16Unsign(rsx) * (rty);
   1107   const double yx = rsy * f16Unsign(rtx);
   1108   const double yy = rsy * rty;
   1109 
   1110   // Split values into outputs
   1111   const double lx = xx;
   1112   const double ly = f16Overflow(xx) + (xy + yx);
   1113   const double hx = f16Overflow(ly) + yy;
   1114   const double hy = f16Overflow(hx);
   1115 
   1116   ploVal.x = static_cast<float>(f16Sign(lx));
   1117   ploVal.y = static_cast<float>(f16Sign(ly));
   1118   ploVal.flags |= VALID_TAINTED_Z | (prtVal.flags & VALID_XY);
   1119   phiVal.x = static_cast<float>(f16Sign(hx));
   1120   phiVal.y = static_cast<float>(f16Sign(hy));
   1121   phiVal.flags |= VALID_TAINTED_Z | (prtVal.flags & VALID_XY);
   1122 
   1123   // compute PSX value
   1124   const u64 result = static_cast<u64>(static_cast<s64>(SignExtend64(rsVal)) * static_cast<s64>(SignExtend64(rtVal)));
   1125   phiVal.value = Truncate32(result >> 32);
   1126   ploVal.value = Truncate32(result);
   1127 }
   1128 
   1129 void CPU::PGXP::CPU_MULTU(Instruction instr, u32 rsVal, u32 rtVal)
   1130 {
   1131   LOG_VALUES_C2(instr.r.rs.GetValue(), rsVal, instr.r.rt.GetValue(), rtVal);
   1132 
   1133   // Hi/Lo = Rs * Rt (unsigned)
   1134   PGXPValue& prsVal = ValidateAndGetRsValue(instr, rsVal);
   1135   PGXPValue& prtVal = ValidateAndGetRtValue(instr, rtVal);
   1136 
   1137   PGXPValue& ploVal = g_state.pgxp_gpr[static_cast<u8>(Reg::lo)];
   1138   PGXPValue& phiVal = g_state.pgxp_gpr[static_cast<u8>(Reg::hi)];
   1139   ploVal = prsVal;
   1140   CopyZIfMissing(ploVal, prsVal);
   1141 
   1142   // Z/valid is the same
   1143   phiVal = ploVal;
   1144 
   1145   const float rsx = prsVal.GetValidX(rsVal);
   1146   const float rsy = prsVal.GetValidY(rsVal);
   1147   const float rtx = prtVal.GetValidX(rtVal);
   1148   const float rty = prtVal.GetValidY(rtVal);
   1149 
   1150   // Multiply out components
   1151   const double xx = f16Unsign(rsx) * f16Unsign(rtx);
   1152   const double xy = f16Unsign(rsx) * f16Unsign(rty);
   1153   const double yx = f16Unsign(rsy) * f16Unsign(rtx);
   1154   const double yy = f16Unsign(rsy) * f16Unsign(rty);
   1155 
   1156   // Split values into outputs
   1157   const double lx = xx;
   1158   const double ly = f16Overflow(xx) + (xy + yx);
   1159   const double hx = f16Overflow(ly) + yy;
   1160   const double hy = f16Overflow(hx);
   1161 
   1162   ploVal.x = static_cast<float>(f16Sign(lx));
   1163   ploVal.y = static_cast<float>(f16Sign(ly));
   1164   ploVal.flags |= VALID_TAINTED_Z | (prtVal.flags & VALID_XY);
   1165   phiVal.x = static_cast<float>(f16Sign(hx));
   1166   phiVal.y = static_cast<float>(f16Sign(hy));
   1167   phiVal.flags |= VALID_TAINTED_Z | (prtVal.flags & VALID_XY);
   1168 
   1169   // compute PSX value
   1170   const u64 result = ZeroExtend64(rsVal) * ZeroExtend64(rtVal);
   1171   phiVal.value = Truncate32(result >> 32);
   1172   ploVal.value = Truncate32(result);
   1173 }
   1174 
   1175 void CPU::PGXP::CPU_DIV(Instruction instr, u32 rsVal, u32 rtVal)
   1176 {
   1177   LOG_VALUES_C2(instr.r.rs.GetValue(), rsVal, instr.r.rt.GetValue(), rtVal);
   1178 
   1179   // Lo = Rs / Rt (signed)
   1180   // Hi = Rs % Rt (signed)
   1181   PGXPValue& prsVal = ValidateAndGetRsValue(instr, rsVal);
   1182   PGXPValue& prtVal = ValidateAndGetRtValue(instr, rtVal);
   1183 
   1184   PGXPValue& ploVal = g_state.pgxp_gpr[static_cast<u8>(Reg::lo)];
   1185   PGXPValue& phiVal = g_state.pgxp_gpr[static_cast<u8>(Reg::hi)];
   1186   ploVal = prsVal;
   1187   CopyZIfMissing(ploVal, prsVal);
   1188 
   1189   // Z/valid is the same
   1190   phiVal = ploVal;
   1191 
   1192   const double vs = f16Unsign(prsVal.GetValidX(rsVal)) + prsVal.GetValidY(rsVal) * static_cast<double>(1 << 16);
   1193   const double vt = f16Unsign(prtVal.GetValidX(rtVal)) + prtVal.GetValidY(rtVal) * static_cast<double>(1 << 16);
   1194 
   1195   const double lo = vs / vt;
   1196   ploVal.y = static_cast<float>(f16Sign(f16Overflow(lo)));
   1197   ploVal.x = static_cast<float>(f16Sign(lo));
   1198   ploVal.flags |= VALID_TAINTED_Z | (prtVal.flags & VALID_XY);
   1199 
   1200   const double hi = std::fmod(vs, vt);
   1201   phiVal.y = static_cast<float>(f16Sign(f16Overflow(hi)));
   1202   phiVal.x = static_cast<float>(f16Sign(hi));
   1203   phiVal.flags |= VALID_TAINTED_Z | (prtVal.flags & VALID_XY);
   1204 
   1205   // compute PSX value
   1206   if (static_cast<s32>(rtVal) == 0)
   1207   {
   1208     // divide by zero
   1209     ploVal.value = (static_cast<s32>(rsVal) >= 0) ? UINT32_C(0xFFFFFFFF) : UINT32_C(1);
   1210     phiVal.value = static_cast<u32>(static_cast<s32>(rsVal));
   1211   }
   1212   else if (rsVal == UINT32_C(0x80000000) && static_cast<s32>(rtVal) == -1)
   1213   {
   1214     // unrepresentable
   1215     ploVal.value = UINT32_C(0x80000000);
   1216     phiVal.value = 0;
   1217   }
   1218   else
   1219   {
   1220     ploVal.value = static_cast<u32>(static_cast<s32>(rsVal) / static_cast<s32>(rtVal));
   1221     phiVal.value = static_cast<u32>(static_cast<s32>(rsVal) % static_cast<s32>(rtVal));
   1222   }
   1223 }
   1224 
   1225 void CPU::PGXP::CPU_DIVU(Instruction instr, u32 rsVal, u32 rtVal)
   1226 {
   1227   LOG_VALUES_C2(instr.r.rs.GetValue(), rsVal, instr.r.rt.GetValue(), rtVal);
   1228 
   1229   // Lo = Rs / Rt (unsigned)
   1230   // Hi = Rs % Rt (unsigned)
   1231   PGXPValue& prsVal = ValidateAndGetRsValue(instr, rsVal);
   1232   PGXPValue& prtVal = ValidateAndGetRtValue(instr, rtVal);
   1233 
   1234   PGXPValue& ploVal = g_state.pgxp_gpr[static_cast<u8>(Reg::lo)];
   1235   PGXPValue& phiVal = g_state.pgxp_gpr[static_cast<u8>(Reg::hi)];
   1236   ploVal = prsVal;
   1237   CopyZIfMissing(ploVal, prsVal);
   1238 
   1239   // Z/valid is the same
   1240   phiVal = ploVal;
   1241 
   1242   const double vs =
   1243     f16Unsign(prsVal.GetValidX(rsVal)) + f16Unsign(prsVal.GetValidY(rsVal)) * static_cast<double>(1 << 16);
   1244   const double vt =
   1245     f16Unsign(prtVal.GetValidX(rtVal)) + f16Unsign(prtVal.GetValidY(rtVal)) * static_cast<double>(1 << 16);
   1246 
   1247   const double lo = vs / vt;
   1248   ploVal.y = static_cast<float>(f16Sign(f16Overflow(lo)));
   1249   ploVal.x = static_cast<float>(f16Sign(lo));
   1250   ploVal.flags |= VALID_TAINTED_Z | (prtVal.flags & VALID_XY);
   1251 
   1252   const double hi = std::fmod(vs, vt);
   1253   phiVal.y = static_cast<float>(f16Sign(f16Overflow(hi)));
   1254   phiVal.x = static_cast<float>(f16Sign(hi));
   1255   phiVal.flags |= VALID_TAINTED_Z | (prtVal.flags & VALID_XY);
   1256 
   1257   if (rtVal == 0)
   1258   {
   1259     // divide by zero
   1260     ploVal.value = UINT32_C(0xFFFFFFFF);
   1261     phiVal.value = rsVal;
   1262   }
   1263   else
   1264   {
   1265     ploVal.value = rsVal / rtVal;
   1266     phiVal.value = rsVal % rtVal;
   1267   }
   1268 }
   1269 
   1270 ALWAYS_INLINE_RELEASE void CPU::PGXP::CPU_SLL(Instruction instr, u32 rtVal, u32 sh)
   1271 {
   1272   const u32 rdVal = rtVal << sh;
   1273   PGXPValue& prtVal = ValidateAndGetRtValue(instr, rtVal);
   1274   PGXPValue& prdVal = GetRdValue(instr);
   1275   prdVal.z = prtVal.z;
   1276   prdVal.value = rdVal;
   1277 
   1278   if (sh >= 32) [[unlikely]]
   1279   {
   1280     prdVal.x = 0.0f;
   1281     prdVal.y = 0.0f;
   1282     prdVal.flags = prtVal.flags | VALID_XY | VALID_TAINTED_Z;
   1283   }
   1284   else if (sh == 16)
   1285   {
   1286     prdVal.y = prtVal.x;
   1287     prdVal.x = 0.0f;
   1288 
   1289     // Only set valid X if there's also a valid Y. We could use GetValidX() to pull it from the low precision value
   1290     // instead, need to investigate further. Spyro breaks if only X is set even if Y is not valid.
   1291     // prdVal.flags = (prtVal.flags & ~VALID_Y) | ((prtVal.flags & VALID_X) << 1) | VALID_X | VALID_TAINTED_Z;
   1292     prdVal.flags = (prtVal.flags | VALID_TAINTED_Z) | ((prtVal.flags & VALID_Y) >> 1);
   1293   }
   1294   else if (sh >= 16)
   1295   {
   1296     prdVal.y = static_cast<float>(f16Sign(f16Unsign(prtVal.x * static_cast<double>(1 << (sh - 16)))));
   1297     prdVal.x = 0.0f;
   1298 
   1299     // See above.
   1300     // prdVal.flags = (prtVal.flags & ~VALID_Y) | ((prtVal.flags & VALID_X) << 1) | VALID_X | VALID_TAINTED_Z;
   1301     prdVal.flags = (prtVal.flags | VALID_TAINTED_Z) | ((prtVal.flags & VALID_Y) >> 1);
   1302   }
   1303   else
   1304   {
   1305     const double x = f16Unsign(prtVal.x) * static_cast<double>(1 << sh);
   1306     const double y = (f16Unsign(prtVal.y) * static_cast<double>(1 << sh)) + f16Overflow(x);
   1307     prdVal.x = static_cast<float>(f16Sign(x));
   1308     prdVal.y = static_cast<float>(f16Sign(y));
   1309     prdVal.flags = (prtVal.flags | VALID_TAINTED_Z);
   1310   }
   1311 }
   1312 
   1313 void CPU::PGXP::CPU_SLL(Instruction instr, u32 rtVal)
   1314 {
   1315   LOG_VALUES_C1(instr.r.rt.GetValue(), rtVal);
   1316 
   1317   // Rd = Rt << Sa
   1318   const u32 sh = instr.r.shamt;
   1319   CPU_SLL(instr, rtVal, sh);
   1320 }
   1321 
   1322 void CPU::PGXP::CPU_SLLV(Instruction instr, u32 rtVal, u32 rsVal)
   1323 {
   1324   LOG_VALUES_C2(instr.r.rt.GetValue(), rtVal, instr.r.rs.GetValue(), rsVal);
   1325 
   1326   // Rd = Rt << Rs
   1327   const u32 sh = rsVal & 0x1F;
   1328   CPU_SLL(instr, rtVal, sh);
   1329 }
   1330 
   1331 ALWAYS_INLINE_RELEASE void CPU::PGXP::CPU_SRx(Instruction instr, u32 rtVal, u32 sh, bool sign, bool is_variable)
   1332 {
   1333   const u32 rdVal = sign ? static_cast<u32>(static_cast<s32>(rtVal) >> sh) : (rtVal >> sh);
   1334   PGXPValue& prtVal = ValidateAndGetRtValue(instr, rtVal);
   1335 
   1336   double x = prtVal.x;
   1337   double y = sign ? prtVal.y : f16Unsign(prtVal.y);
   1338 
   1339   const u32 iX = SignExtend32(LOWORD_S16(rtVal));   // remove Y
   1340   const u32 iY = SET_LOWORD(rtVal, HIWORD_U16(iX)); // overwrite x with sign(x)
   1341 
   1342   // Shift test values
   1343   const u32 dX = static_cast<u32>(static_cast<s32>(iX) >> sh);
   1344   const u32 dY = sign ? static_cast<u32>(static_cast<s32>(iY) >> sh) : (iY >> sh);
   1345 
   1346   if (LOWORD_S16(dX) != HIWORD_S16(iX))
   1347     x = x / static_cast<double>(1 << sh);
   1348   else
   1349     x = LOWORD_S16(dX); // only sign bits left
   1350 
   1351   if (LOWORD_S16(dY) != HIWORD_S16(iX))
   1352   {
   1353     if (sh == 16)
   1354     {
   1355       x = y;
   1356     }
   1357     else if (sh < 16)
   1358     {
   1359       x += y * static_cast<double>(1 << (16 - sh));
   1360       if (prtVal.x < 0)
   1361         x += static_cast<double>(1 << (16 - sh));
   1362     }
   1363     else
   1364     {
   1365       x += y / static_cast<double>(1 << (sh - 16));
   1366     }
   1367   }
   1368 
   1369   if ((HIWORD_S16(dY) == 0) || (HIWORD_S16(dY) == -1))
   1370     y = HIWORD_S16(dY);
   1371   else
   1372     y = y / static_cast<double>(1 << sh);
   1373 
   1374   PGXPValue& prdVal = GetRdValue(instr);
   1375 
   1376   // Use low precision/rounded values when we're not shifting an entire component,
   1377   // and it's not originally from a 3D value. Too many false positives in P2/etc.
   1378   // What we probably should do is not set the valid flag on non-3D values to begin
   1379   // with, only letting them become valid when used in another expression.
   1380   if (sign && !is_variable && !(prtVal.flags & VALID_Z) && sh < 16)
   1381   {
   1382     prdVal.x = static_cast<float>(LOWORD_S16(rdVal));
   1383     prdVal.y = static_cast<float>(HIWORD_S16(rdVal));
   1384     prdVal.z = 0.0f;
   1385     prdVal.value = rdVal;
   1386     prdVal.flags = VALID_XY | VALID_TAINTED_Z;
   1387   }
   1388   else
   1389   {
   1390     prdVal.x = static_cast<float>(f16Sign(x));
   1391     prdVal.y = static_cast<float>(f16Sign(y));
   1392     prdVal.z = prtVal.z;
   1393     prdVal.value = rdVal;
   1394     prdVal.flags = prtVal.flags | VALID_TAINTED_Z;
   1395   }
   1396 }
   1397 
   1398 void CPU::PGXP::CPU_SRL(Instruction instr, u32 rtVal)
   1399 {
   1400   LOG_VALUES_C1(instr.r.rt.GetValue(), rtVal);
   1401 
   1402   // Rd = Rt >> Sa
   1403   const u32 sh = instr.r.shamt;
   1404   CPU_SRx(instr, rtVal, sh, false, false);
   1405 }
   1406 
   1407 void CPU::PGXP::CPU_SRLV(Instruction instr, u32 rtVal, u32 rsVal)
   1408 {
   1409   LOG_VALUES_C2(instr.r.rt.GetValue(), rtVal, instr.r.rs.GetValue(), rsVal);
   1410 
   1411   // Rd = Rt >> Sa
   1412   const u32 sh = rsVal & 0x1F;
   1413   CPU_SRx(instr, rtVal, sh, false, true);
   1414 }
   1415 
   1416 void CPU::PGXP::CPU_SRA(Instruction instr, u32 rtVal)
   1417 {
   1418   LOG_VALUES_C1(instr.r.rt.GetValue(), rtVal);
   1419 
   1420   // Rd = Rt >> Sa
   1421   const u32 sh = instr.r.shamt;
   1422   CPU_SRx(instr, rtVal, sh, true, false);
   1423 }
   1424 
   1425 void CPU::PGXP::CPU_SRAV(Instruction instr, u32 rtVal, u32 rsVal)
   1426 {
   1427   LOG_VALUES_C2(instr.r.rt.GetValue(), rtVal, instr.r.rs.GetValue(), rsVal);
   1428 
   1429   // Rd = Rt >> Sa
   1430   const u32 sh = rsVal & 0x1F;
   1431   CPU_SRx(instr, rtVal, sh, true, true);
   1432 }
   1433 
   1434 void CPU::PGXP::CPU_MFC0(Instruction instr, u32 rdVal)
   1435 {
   1436   const u32 idx = static_cast<u8>(instr.r.rd.GetValue());
   1437   LOG_VALUES_1(TinyString::from_format("cop0_{}", idx).c_str(), rdVal, &g_state.pgxp_cop0[idx]);
   1438 
   1439   // CPU[Rt] = CP0[Rd]
   1440   PGXPValue& prdVal = g_state.pgxp_cop0[idx];
   1441   prdVal.Validate(rdVal);
   1442 
   1443   PGXPValue& prtVal = GetRtValue(instr);
   1444   prtVal = prdVal;
   1445   prtVal.value = rdVal;
   1446 }
   1447 
   1448 void CPU::PGXP::CPU_MTC0(Instruction instr, u32 rdVal, u32 rtVal)
   1449 {
   1450   LOG_VALUES_C1(instr.r.rt.GetValue(), rtVal);
   1451 
   1452   // CP0[Rd] = CPU[Rt]
   1453   PGXPValue& prtVal = ValidateAndGetRtValue(instr, rtVal);
   1454   PGXPValue& prdVal = g_state.pgxp_cop0[static_cast<u8>(instr.r.rd.GetValue())];
   1455   prdVal = prtVal;
   1456   prtVal.value = rdVal;
   1457 }