duckstation

DuckStation, archived from the revision just before upstream changed it to a proprietary software project. This version is the libre one.
git clone https://git.neptards.moe/u3shit/duckstation.git
Log | Files | Refs | README | LICENSE

gte.cpp (41205B)


      1 // SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin <stenzek@gmail.com>
      2 // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
      3 
      4 #include "gte.h"
      5 
      6 #include "cpu_core.h"
      7 #include "cpu_core_private.h"
      8 #include "cpu_pgxp.h"
      9 #include "settings.h"
     10 
     11 #include "util/gpu_device.h"
     12 #include "util/state_wrapper.h"
     13 
     14 #include "common/assert.h"
     15 #include "common/bitutils.h"
     16 
     17 #include <algorithm>
     18 #include <array>
     19 #include <numeric>
     20 
     21 namespace GTE {
     22 
     23 static constexpr s64 MAC0_MIN_VALUE = -(INT64_C(1) << 31);
     24 static constexpr s64 MAC0_MAX_VALUE = (INT64_C(1) << 31) - 1;
     25 static constexpr s64 MAC123_MIN_VALUE = -(INT64_C(1) << 43);
     26 static constexpr s64 MAC123_MAX_VALUE = (INT64_C(1) << 43) - 1;
     27 static constexpr s32 IR0_MIN_VALUE = 0x0000;
     28 static constexpr s32 IR0_MAX_VALUE = 0x1000;
     29 static constexpr s32 IR123_MIN_VALUE = -(INT64_C(1) << 15);
     30 static constexpr s32 IR123_MAX_VALUE = (INT64_C(1) << 15) - 1;
     31 
     32 namespace {
     33 struct Config
     34 {
     35   DisplayAspectRatio aspect_ratio = DisplayAspectRatio::R4_3;
     36   u32 custom_aspect_ratio_numerator;
     37   u32 custom_aspect_ratio_denominator;
     38   float custom_aspect_ratio_f;
     39 };
     40 } // namespace
     41 
     42 ALIGN_TO_CACHE_LINE static Config s_config;
     43 
     44 #define REGS CPU::g_state.gte_regs
     45 
     46 ALWAYS_INLINE static u32 CountLeadingBits(u32 value)
     47 {
     48   // if top-most bit is set, we want to count ones not zeros
     49   if (value & UINT32_C(0x80000000))
     50     value ^= UINT32_C(0xFFFFFFFF);
     51 
     52   return (value == 0u) ? 32 : CountLeadingZeros(value);
     53 }
     54 
     55 template<u32 index>
     56 ALWAYS_INLINE static void CheckMACOverflow(s64 value)
     57 {
     58   constexpr s64 MIN_VALUE = (index == 0) ? MAC0_MIN_VALUE : MAC123_MIN_VALUE;
     59   constexpr s64 MAX_VALUE = (index == 0) ? MAC0_MAX_VALUE : MAC123_MAX_VALUE;
     60   if (value < MIN_VALUE)
     61   {
     62     if constexpr (index == 0)
     63       REGS.FLAG.mac0_underflow = true;
     64     else if constexpr (index == 1)
     65       REGS.FLAG.mac1_underflow = true;
     66     else if constexpr (index == 2)
     67       REGS.FLAG.mac2_underflow = true;
     68     else if constexpr (index == 3)
     69       REGS.FLAG.mac3_underflow = true;
     70   }
     71   else if (value > MAX_VALUE)
     72   {
     73     if constexpr (index == 0)
     74       REGS.FLAG.mac0_overflow = true;
     75     else if constexpr (index == 1)
     76       REGS.FLAG.mac1_overflow = true;
     77     else if constexpr (index == 2)
     78       REGS.FLAG.mac2_overflow = true;
     79     else if constexpr (index == 3)
     80       REGS.FLAG.mac3_overflow = true;
     81   }
     82 }
     83 
     84 template<u32 index>
     85 ALWAYS_INLINE static s64 SignExtendMACResult(s64 value)
     86 {
     87   CheckMACOverflow<index>(value);
     88   return SignExtendN < index == 0 ? 31 : 44 > (value);
     89 }
     90 
     91 template<u32 index>
     92 ALWAYS_INLINE static void TruncateAndSetMAC(s64 value, u8 shift)
     93 {
     94   CheckMACOverflow<index>(value);
     95 
     96   // shift should be done before storing to avoid losing precision
     97   value >>= shift;
     98 
     99   REGS.dr32[24 + index] = Truncate32(static_cast<u64>(value));
    100 }
    101 
    102 template<u32 index>
    103 ALWAYS_INLINE static void TruncateAndSetIR(s32 value, bool lm)
    104 {
    105   constexpr s32 MIN_VALUE = (index == 0) ? IR0_MIN_VALUE : IR123_MIN_VALUE;
    106   constexpr s32 MAX_VALUE = (index == 0) ? IR0_MAX_VALUE : IR123_MAX_VALUE;
    107   const s32 actual_min_value = lm ? 0 : MIN_VALUE;
    108   if (value < actual_min_value)
    109   {
    110     value = actual_min_value;
    111     if constexpr (index == 0)
    112       REGS.FLAG.ir0_saturated = true;
    113     else if constexpr (index == 1)
    114       REGS.FLAG.ir1_saturated = true;
    115     else if constexpr (index == 2)
    116       REGS.FLAG.ir2_saturated = true;
    117     else if constexpr (index == 3)
    118       REGS.FLAG.ir3_saturated = true;
    119   }
    120   else if (value > MAX_VALUE)
    121   {
    122     value = MAX_VALUE;
    123     if constexpr (index == 0)
    124       REGS.FLAG.ir0_saturated = true;
    125     else if constexpr (index == 1)
    126       REGS.FLAG.ir1_saturated = true;
    127     else if constexpr (index == 2)
    128       REGS.FLAG.ir2_saturated = true;
    129     else if constexpr (index == 3)
    130       REGS.FLAG.ir3_saturated = true;
    131   }
    132 
    133   // store sign-extended 16-bit value as 32-bit
    134   REGS.dr32[8 + index] = value;
    135 }
    136 
    137 template<u32 index>
    138 ALWAYS_INLINE static void TruncateAndSetMACAndIR(s64 value, u8 shift, bool lm)
    139 {
    140   CheckMACOverflow<index>(value);
    141 
    142   // shift should be done before storing to avoid losing precision
    143   value >>= shift;
    144 
    145   // set MAC
    146   const s32 value32 = static_cast<s32>(value);
    147   REGS.dr32[24 + index] = value32;
    148 
    149   // set IR
    150   TruncateAndSetIR<index>(value32, lm);
    151 }
    152 
    153 template<u32 index>
    154 ALWAYS_INLINE static u32 TruncateRGB(s32 value)
    155 {
    156   if (value < 0 || value > 0xFF)
    157   {
    158     if constexpr (index == 0)
    159       REGS.FLAG.color_r_saturated = true;
    160     else if constexpr (index == 1)
    161       REGS.FLAG.color_g_saturated = true;
    162     else
    163       REGS.FLAG.color_b_saturated = true;
    164 
    165     return (value < 0) ? 0 : 0xFF;
    166   }
    167 
    168   return static_cast<u32>(value);
    169 }
    170 
    171 static void SetOTZ(s32 value);
    172 static void PushSXY(s32 x, s32 y);
    173 static void PushSZ(s32 value);
    174 static void PushRGBFromMAC();
    175 static u32 UNRDivide(u32 lhs, u32 rhs);
    176 
    177 static void MulMatVec(const s16* M_, const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm);
    178 static void MulMatVec(const s16* M_, const s32 T[3], const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm);
    179 static void MulMatVecBuggy(const s16* M_, const s32 T[3], const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm);
    180 
    181 static void InterpolateColor(s64 in_MAC1, s64 in_MAC2, s64 in_MAC3, u8 shift, bool lm);
    182 static void RTPS(const s16 V[3], u8 shift, bool lm, bool last);
    183 static void NCS(const s16 V[3], u8 shift, bool lm);
    184 static void NCCS(const s16 V[3], u8 shift, bool lm);
    185 static void NCDS(const s16 V[3], u8 shift, bool lm);
    186 static void DPCS(const u8 color[3], u8 shift, bool lm);
    187 
    188 static void Execute_MVMVA(Instruction inst);
    189 static void Execute_SQR(Instruction inst);
    190 static void Execute_OP(Instruction inst);
    191 static void Execute_RTPS(Instruction inst);
    192 static void Execute_RTPT(Instruction inst);
    193 static void Execute_NCLIP(Instruction inst);
    194 static void Execute_NCLIP_PGXP(Instruction inst);
    195 static void Execute_AVSZ3(Instruction inst);
    196 static void Execute_AVSZ4(Instruction inst);
    197 static void Execute_NCS(Instruction inst);
    198 static void Execute_NCT(Instruction inst);
    199 static void Execute_NCCS(Instruction inst);
    200 static void Execute_NCCT(Instruction inst);
    201 static void Execute_NCDS(Instruction inst);
    202 static void Execute_NCDT(Instruction inst);
    203 static void Execute_CC(Instruction inst);
    204 static void Execute_CDP(Instruction inst);
    205 static void Execute_DPCS(Instruction inst);
    206 static void Execute_DPCT(Instruction inst);
    207 static void Execute_DCPL(Instruction inst);
    208 static void Execute_INTPL(Instruction inst);
    209 static void Execute_GPL(Instruction inst);
    210 static void Execute_GPF(Instruction inst);
    211 
    212 } // namespace GTE
    213 
    214 void GTE::Initialize()
    215 {
    216   s_config.aspect_ratio = DisplayAspectRatio::R4_3;
    217   Reset();
    218 }
    219 
    220 void GTE::Reset()
    221 {
    222   std::memset(&REGS, 0, sizeof(REGS));
    223 }
    224 
    225 bool GTE::DoState(StateWrapper& sw)
    226 {
    227   sw.DoArray(REGS.r32, NUM_DATA_REGS + NUM_CONTROL_REGS);
    228   return !sw.HasError();
    229 }
    230 
    231 void GTE::UpdateAspectRatio()
    232 {
    233   if (!g_settings.gpu_widescreen_hack)
    234   {
    235     s_config.aspect_ratio = DisplayAspectRatio::R4_3;
    236     return;
    237   }
    238 
    239   s_config.aspect_ratio = g_settings.display_aspect_ratio;
    240 
    241   u32 num, denom;
    242   switch (s_config.aspect_ratio)
    243   {
    244     case DisplayAspectRatio::MatchWindow:
    245     {
    246       if (!g_gpu_device)
    247       {
    248         s_config.aspect_ratio = DisplayAspectRatio::R4_3;
    249         return;
    250       }
    251 
    252       num = g_gpu_device->GetWindowWidth();
    253       denom = g_gpu_device->GetWindowHeight();
    254     }
    255     break;
    256 
    257     case DisplayAspectRatio::Custom:
    258     {
    259       num = g_settings.display_aspect_ratio_custom_numerator;
    260       denom = g_settings.display_aspect_ratio_custom_denominator;
    261     }
    262     break;
    263 
    264     default:
    265       return;
    266   }
    267 
    268   // (4 / 3) / (num / denom) => gcd((4 * denom) / (3 * num))
    269   const u32 x = 4u * denom;
    270   const u32 y = 3u * num;
    271   const u32 gcd = std::gcd(x, y);
    272 
    273   s_config.custom_aspect_ratio_numerator = x / gcd;
    274   s_config.custom_aspect_ratio_denominator = y / gcd;
    275 
    276   s_config.custom_aspect_ratio_f =
    277     static_cast<float>((4.0 / 3.0) / (static_cast<double>(num) / static_cast<double>(denom)));
    278 }
    279 
    280 u32 GTE::ReadRegister(u32 index)
    281 {
    282   DebugAssert(index < countof(REGS.r32));
    283 
    284   switch (index)
    285   {
    286     case 15: // SXY3
    287     {
    288       // mirror of SXY2
    289       return REGS.r32[14];
    290     }
    291 
    292     case 28: // IRGB
    293     case 29: // ORGB
    294     {
    295       // ORGB register, convert 16-bit to 555
    296       const u8 r = static_cast<u8>(std::clamp(REGS.IR1 / 0x80, 0x00, 0x1F));
    297       const u8 g = static_cast<u8>(std::clamp(REGS.IR2 / 0x80, 0x00, 0x1F));
    298       const u8 b = static_cast<u8>(std::clamp(REGS.IR3 / 0x80, 0x00, 0x1F));
    299       return ZeroExtend32(r) | (ZeroExtend32(g) << 5) | (ZeroExtend32(b) << 10);
    300     }
    301 
    302     default:
    303       return REGS.r32[index];
    304   }
    305 }
    306 
    307 void GTE::WriteRegister(u32 index, u32 value)
    308 {
    309 #if 0
    310   if (index < 32)
    311   {
    312     Log_DebugPrintf("DataReg(%u) <- 0x%08X", index, value);
    313   }
    314   else
    315   {
    316     Log_DebugPrintf("ControlReg(%u) <- 0x%08X", index, value);
    317   }
    318 #endif
    319 
    320   switch (index)
    321   {
    322     case 1:  // V0[z]
    323     case 3:  // V1[z]
    324     case 5:  // V2[z]
    325     case 8:  // IR0
    326     case 9:  // IR1
    327     case 10: // IR2
    328     case 11: // IR3
    329     case 36: // RT33
    330     case 44: // L33
    331     case 52: // LR33
    332     case 58: // H       - sign-extended on read but zext on use
    333     case 59: // DQA
    334     case 61: // ZSF3
    335     case 62: // ZSF4
    336     {
    337       // sign-extend z component of vector registers
    338       REGS.r32[index] = SignExtend32(Truncate16(value));
    339     }
    340     break;
    341 
    342     case 7:  // OTZ
    343     case 16: // SZ0
    344     case 17: // SZ1
    345     case 18: // SZ2
    346     case 19: // SZ3
    347     {
    348       // zero-extend unsigned values
    349       REGS.r32[index] = ZeroExtend32(Truncate16(value));
    350     }
    351     break;
    352 
    353     case 15: // SXY3
    354     {
    355       // writing to SXYP pushes to the FIFO
    356       REGS.r32[12] = REGS.r32[13]; // SXY0 <- SXY1
    357       REGS.r32[13] = REGS.r32[14]; // SXY1 <- SXY2
    358       REGS.r32[14] = value;        // SXY2 <- SXYP
    359     }
    360     break;
    361 
    362     case 28: // IRGB
    363     {
    364       // IRGB register, convert 555 to 16-bit
    365       REGS.IRGB = value & UINT32_C(0x7FFF);
    366       REGS.r32[9] = SignExtend32(static_cast<u16>(Truncate16((value & UINT32_C(0x1F)) * UINT32_C(0x80))));
    367       REGS.r32[10] = SignExtend32(static_cast<u16>(Truncate16(((value >> 5) & UINT32_C(0x1F)) * UINT32_C(0x80))));
    368       REGS.r32[11] = SignExtend32(static_cast<u16>(Truncate16(((value >> 10) & UINT32_C(0x1F)) * UINT32_C(0x80))));
    369     }
    370     break;
    371 
    372     case 30: // LZCS
    373     {
    374       REGS.LZCS = static_cast<s32>(value);
    375       REGS.LZCR = CountLeadingBits(value);
    376     }
    377     break;
    378 
    379     case 29: // ORGB
    380     case 31: // LZCR
    381     {
    382       // read-only registers
    383     }
    384     break;
    385 
    386     case 63: // FLAG
    387     {
    388       REGS.FLAG.bits = value & UINT32_C(0x7FFFF000);
    389       REGS.FLAG.UpdateError();
    390     }
    391     break;
    392 
    393     default:
    394     {
    395       // written as-is, 2x16 or 1x32 bits
    396       REGS.r32[index] = value;
    397     }
    398     break;
    399   }
    400 }
    401 
    402 u32* GTE::GetRegisterPtr(u32 index)
    403 {
    404   return &REGS.r32[index];
    405 }
    406 
    407 ALWAYS_INLINE void GTE::SetOTZ(s32 value)
    408 {
    409   if (value < 0)
    410   {
    411     REGS.FLAG.sz1_otz_saturated = true;
    412     value = 0;
    413   }
    414   else if (value > 0xFFFF)
    415   {
    416     REGS.FLAG.sz1_otz_saturated = true;
    417     value = 0xFFFF;
    418   }
    419 
    420   REGS.dr32[7] = static_cast<u32>(value);
    421 }
    422 
    423 ALWAYS_INLINE void GTE::PushSXY(s32 x, s32 y)
    424 {
    425   if (x < -1024)
    426   {
    427     REGS.FLAG.sx2_saturated = true;
    428     x = -1024;
    429   }
    430   else if (x > 1023)
    431   {
    432     REGS.FLAG.sx2_saturated = true;
    433     x = 1023;
    434   }
    435 
    436   if (y < -1024)
    437   {
    438     REGS.FLAG.sy2_saturated = true;
    439     y = -1024;
    440   }
    441   else if (y > 1023)
    442   {
    443     REGS.FLAG.sy2_saturated = true;
    444     y = 1023;
    445   }
    446 
    447   REGS.dr32[12] = REGS.dr32[13]; // SXY0 <- SXY1
    448   REGS.dr32[13] = REGS.dr32[14]; // SXY1 <- SXY2
    449   REGS.dr32[14] = (static_cast<u32>(x) & 0xFFFFu) | (static_cast<u32>(y) << 16);
    450 }
    451 
    452 ALWAYS_INLINE void GTE::PushSZ(s32 value)
    453 {
    454   if (value < 0)
    455   {
    456     REGS.FLAG.sz1_otz_saturated = true;
    457     value = 0;
    458   }
    459   else if (value > 0xFFFF)
    460   {
    461     REGS.FLAG.sz1_otz_saturated = true;
    462     value = 0xFFFF;
    463   }
    464 
    465   REGS.dr32[16] = REGS.dr32[17];           // SZ0 <- SZ1
    466   REGS.dr32[17] = REGS.dr32[18];           // SZ1 <- SZ2
    467   REGS.dr32[18] = REGS.dr32[19];           // SZ2 <- SZ3
    468   REGS.dr32[19] = static_cast<u32>(value); // SZ3 <- value
    469 }
    470 
    471 ALWAYS_INLINE void GTE::PushRGBFromMAC()
    472 {
    473   // Note: SHR 4 used instead of /16 as the results are different.
    474   const u32 r = TruncateRGB<0>(static_cast<u32>(REGS.MAC1 >> 4));
    475   const u32 g = TruncateRGB<1>(static_cast<u32>(REGS.MAC2 >> 4));
    476   const u32 b = TruncateRGB<2>(static_cast<u32>(REGS.MAC3 >> 4));
    477   const u32 c = ZeroExtend32(REGS.RGBC[3]);
    478 
    479   REGS.dr32[20] = REGS.dr32[21];                        // RGB0 <- RGB1
    480   REGS.dr32[21] = REGS.dr32[22];                        // RGB1 <- RGB2
    481   REGS.dr32[22] = r | (g << 8) | (b << 16) | (c << 24); // RGB2 <- Value
    482 }
    483 
    484 ALWAYS_INLINE u32 GTE::UNRDivide(u32 lhs, u32 rhs)
    485 {
    486   if (rhs * 2 <= lhs)
    487   {
    488     REGS.FLAG.divide_overflow = true;
    489     return 0x1FFFF;
    490   }
    491 
    492   const u32 shift = (rhs == 0) ? 16 : CountLeadingZeros(static_cast<u16>(rhs));
    493   lhs <<= shift;
    494   rhs <<= shift;
    495 
    496   static constexpr std::array<u8, 257> unr_table = {{
    497     0xFF, 0xFD, 0xFB, 0xF9, 0xF7, 0xF5, 0xF3, 0xF1, 0xEF, 0xEE, 0xEC, 0xEA, 0xE8, 0xE6, 0xE4, 0xE3, //
    498     0xE1, 0xDF, 0xDD, 0xDC, 0xDA, 0xD8, 0xD6, 0xD5, 0xD3, 0xD1, 0xD0, 0xCE, 0xCD, 0xCB, 0xC9, 0xC8, //  00h..3Fh
    499     0xC6, 0xC5, 0xC3, 0xC1, 0xC0, 0xBE, 0xBD, 0xBB, 0xBA, 0xB8, 0xB7, 0xB5, 0xB4, 0xB2, 0xB1, 0xB0, //
    500     0xAE, 0xAD, 0xAB, 0xAA, 0xA9, 0xA7, 0xA6, 0xA4, 0xA3, 0xA2, 0xA0, 0x9F, 0x9E, 0x9C, 0x9B, 0x9A, //
    501     0x99, 0x97, 0x96, 0x95, 0x94, 0x92, 0x91, 0x90, 0x8F, 0x8D, 0x8C, 0x8B, 0x8A, 0x89, 0x87, 0x86, //
    502     0x85, 0x84, 0x83, 0x82, 0x81, 0x7F, 0x7E, 0x7D, 0x7C, 0x7B, 0x7A, 0x79, 0x78, 0x77, 0x75, 0x74, //  40h..7Fh
    503     0x73, 0x72, 0x71, 0x70, 0x6F, 0x6E, 0x6D, 0x6C, 0x6B, 0x6A, 0x69, 0x68, 0x67, 0x66, 0x65, 0x64, //
    504     0x63, 0x62, 0x61, 0x60, 0x5F, 0x5E, 0x5D, 0x5D, 0x5C, 0x5B, 0x5A, 0x59, 0x58, 0x57, 0x56, 0x55, //
    505     0x54, 0x53, 0x53, 0x52, 0x51, 0x50, 0x4F, 0x4E, 0x4D, 0x4D, 0x4C, 0x4B, 0x4A, 0x49, 0x48, 0x48, //
    506     0x47, 0x46, 0x45, 0x44, 0x43, 0x43, 0x42, 0x41, 0x40, 0x3F, 0x3F, 0x3E, 0x3D, 0x3C, 0x3C, 0x3B, //  80h..BFh
    507     0x3A, 0x39, 0x39, 0x38, 0x37, 0x36, 0x36, 0x35, 0x34, 0x33, 0x33, 0x32, 0x31, 0x31, 0x30, 0x2F, //
    508     0x2E, 0x2E, 0x2D, 0x2C, 0x2C, 0x2B, 0x2A, 0x2A, 0x29, 0x28, 0x28, 0x27, 0x26, 0x26, 0x25, 0x24, //
    509     0x24, 0x23, 0x22, 0x22, 0x21, 0x20, 0x20, 0x1F, 0x1E, 0x1E, 0x1D, 0x1D, 0x1C, 0x1B, 0x1B, 0x1A, //
    510     0x19, 0x19, 0x18, 0x18, 0x17, 0x16, 0x16, 0x15, 0x15, 0x14, 0x14, 0x13, 0x12, 0x12, 0x11, 0x11, //  C0h..FFh
    511     0x10, 0x0F, 0x0F, 0x0E, 0x0E, 0x0D, 0x0D, 0x0C, 0x0C, 0x0B, 0x0A, 0x0A, 0x09, 0x09, 0x08, 0x08, //
    512     0x07, 0x07, 0x06, 0x06, 0x05, 0x05, 0x04, 0x04, 0x03, 0x03, 0x02, 0x02, 0x01, 0x01, 0x00, 0x00, //
    513     0x00 // <-- one extra table entry (for "(d-7FC0h)/80h"=100h)
    514   }};
    515 
    516   const u32 divisor = rhs | 0x8000;
    517   const s32 x = static_cast<s32>(0x101 + ZeroExtend32(unr_table[((divisor & 0x7FFF) + 0x40) >> 7]));
    518   const s32 d = ((static_cast<s32>(ZeroExtend32(divisor)) * -x) + 0x80) >> 8;
    519   const u32 recip = static_cast<u32>(((x * (0x20000 + d)) + 0x80) >> 8);
    520 
    521   const u32 result = Truncate32((ZeroExtend64(lhs) * ZeroExtend64(recip) + u64(0x8000)) >> 16);
    522 
    523   // The min(1FFFFh) limit is needed for cases like FE3Fh/7F20h, F015h/780Bh, etc. (these do produce UNR result 20000h,
    524   // and are saturated to 1FFFFh, but without setting overflow FLAG bits).
    525   return std::min<u32>(0x1FFFF, result);
    526 }
    527 
    528 void GTE::MulMatVec(const s16* M_, const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm)
    529 {
    530 #define M(i, j) M_[((i) * 3) + (j)]
    531 #define dot3(i)                                                                                                        \
    532   TruncateAndSetMACAndIR<i + 1>(SignExtendMACResult<i + 1>((s64(M(i, 0)) * s64(Vx)) + (s64(M(i, 1)) * s64(Vy))) +      \
    533                                   (s64(M(i, 2)) * s64(Vz)),                                                            \
    534                                 shift, lm)
    535 
    536   dot3(0);
    537   dot3(1);
    538   dot3(2);
    539 
    540 #undef dot3
    541 #undef M
    542 }
    543 
    544 void GTE::MulMatVec(const s16* M_, const s32 T[3], const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm)
    545 {
    546 #define M(i, j) M_[((i) * 3) + (j)]
    547 #define dot3(i)                                                                                                        \
    548   TruncateAndSetMACAndIR<i + 1>(                                                                                       \
    549     SignExtendMACResult<i + 1>(SignExtendMACResult<i + 1>((s64(T[i]) << 12) + (s64(M(i, 0)) * s64(Vx))) +              \
    550                                (s64(M(i, 1)) * s64(Vy))) +                                                             \
    551       (s64(M(i, 2)) * s64(Vz)),                                                                                        \
    552     shift, lm)
    553 
    554   dot3(0);
    555   dot3(1);
    556   dot3(2);
    557 
    558 #undef dot3
    559 #undef M
    560 }
    561 
    562 void GTE::MulMatVecBuggy(const s16* M_, const s32 T[3], const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm)
    563 {
    564 #define M(i, j) M_[((i) * 3) + (j)]
    565 #define dot3(i)                                                                                                        \
    566   do                                                                                                                   \
    567   {                                                                                                                    \
    568     TruncateAndSetIR<i + 1>(static_cast<s32>(SignExtendMACResult<i + 1>(SignExtendMACResult<i + 1>(                    \
    569                                                (s64(T[i]) << 12) + (s64(M(i, 0)) * s64(Vx)))) >>                       \
    570                                              shift),                                                                   \
    571                             false);                                                                                    \
    572     TruncateAndSetMACAndIR<i + 1>(SignExtendMACResult<i + 1>((s64(M(i, 1)) * s64(Vy))) + (s64(M(i, 2)) * s64(Vz)),     \
    573                                   shift, lm);                                                                          \
    574   } while (0)
    575 
    576   dot3(0);
    577   dot3(1);
    578   dot3(2);
    579 
    580 #undef dot3
    581 #undef M
    582 }
    583 
    584 void GTE::Execute_MVMVA(Instruction inst)
    585 {
    586   REGS.FLAG.Clear();
    587 
    588   static constexpr const s16* M_lookup[4] = {&REGS.RT[0][0], &REGS.LLM[0][0], &REGS.LCM[0][0], nullptr};
    589   static constexpr const s16* V_lookup[4][3] = {
    590     {&REGS.V0[0], &REGS.V0[1], &REGS.V0[2]},
    591     {&REGS.V1[0], &REGS.V1[1], &REGS.V1[2]},
    592     {&REGS.V2[0], &REGS.V2[1], &REGS.V2[2]},
    593     {&REGS.IR1, &REGS.IR2, &REGS.IR3},
    594   };
    595   static constexpr const s32 zero_T[3] = {};
    596   static constexpr const s32* T_lookup[4] = {REGS.TR, REGS.BK, REGS.FC, zero_T};
    597 
    598   const s16* M = M_lookup[inst.mvmva_multiply_matrix];
    599   const s16* const* const V = V_lookup[inst.mvmva_multiply_vector];
    600   const s32* const T = T_lookup[inst.mvmva_translation_vector];
    601   s16 buggy_M[3][3];
    602 
    603   if (!M)
    604   {
    605     // buggy
    606     buggy_M[0][0] = -static_cast<s16>(ZeroExtend16(REGS.RGBC[0]) << 4);
    607     buggy_M[0][1] = static_cast<s16>(ZeroExtend16(REGS.RGBC[0]) << 4);
    608     buggy_M[0][2] = REGS.IR0;
    609     buggy_M[1][0] = REGS.RT[0][2];
    610     buggy_M[1][1] = REGS.RT[0][2];
    611     buggy_M[1][2] = REGS.RT[0][2];
    612     buggy_M[2][0] = REGS.RT[1][1];
    613     buggy_M[2][1] = REGS.RT[1][1];
    614     buggy_M[2][2] = REGS.RT[1][1];
    615     M = &buggy_M[0][0];
    616   }
    617 
    618   const s16 Vx = *V[0];
    619   const s16 Vy = *V[1];
    620   const s16 Vz = *V[2];
    621   if (inst.mvmva_translation_vector != 2)
    622     MulMatVec(M, T, Vx, Vy, Vz, inst.GetShift(), inst.lm);
    623   else
    624     MulMatVecBuggy(M, T, Vx, Vy, Vz, inst.GetShift(), inst.lm);
    625 
    626   REGS.FLAG.UpdateError();
    627 }
    628 
    629 void GTE::Execute_SQR(Instruction inst)
    630 {
    631   REGS.FLAG.Clear();
    632 
    633   // 32-bit multiply for speed - 16x16 isn't >32bit, and we know it won't overflow/underflow.
    634   const u8 shift = inst.GetShift();
    635   REGS.MAC1 = (s32(REGS.IR1) * s32(REGS.IR1)) >> shift;
    636   REGS.MAC2 = (s32(REGS.IR2) * s32(REGS.IR2)) >> shift;
    637   REGS.MAC3 = (s32(REGS.IR3) * s32(REGS.IR3)) >> shift;
    638 
    639   const bool lm = inst.lm;
    640   TruncateAndSetIR<1>(REGS.MAC1, lm);
    641   TruncateAndSetIR<2>(REGS.MAC2, lm);
    642   TruncateAndSetIR<3>(REGS.MAC3, lm);
    643 
    644   REGS.FLAG.UpdateError();
    645 }
    646 
    647 void GTE::Execute_OP(Instruction inst)
    648 {
    649   REGS.FLAG.Clear();
    650 
    651   // Take copies since we overwrite them in each step.
    652   const u8 shift = inst.GetShift();
    653   const bool lm = inst.lm;
    654   const s32 D1 = s32(REGS.RT[0][0]);
    655   const s32 D2 = s32(REGS.RT[1][1]);
    656   const s32 D3 = s32(REGS.RT[2][2]);
    657   const s32 IR1 = s32(REGS.IR1);
    658   const s32 IR2 = s32(REGS.IR2);
    659   const s32 IR3 = s32(REGS.IR3);
    660 
    661   // [MAC1,MAC2,MAC3] = [IR3*D2-IR2*D3, IR1*D3-IR3*D1, IR2*D1-IR1*D2] SAR (sf*12)
    662   // [IR1, IR2, IR3] = [MAC1, MAC2, MAC3]; copy result
    663   TruncateAndSetMACAndIR<1>(s64(IR3 * D2) - s64(IR2 * D3), shift, lm);
    664   TruncateAndSetMACAndIR<2>(s64(IR1 * D3) - s64(IR3 * D1), shift, lm);
    665   TruncateAndSetMACAndIR<3>(s64(IR2 * D1) - s64(IR1 * D2), shift, lm);
    666 
    667   REGS.FLAG.UpdateError();
    668 }
    669 
    670 void GTE::RTPS(const s16 V[3], u8 shift, bool lm, bool last)
    671 {
    672 #define dot3(i)                                                                                                        \
    673   SignExtendMACResult<i + 1>(SignExtendMACResult<i + 1>((s64(REGS.TR[i]) << 12) + (s64(REGS.RT[i][0]) * s64(V[0]))) +  \
    674                              (s64(REGS.RT[i][1]) * s64(V[1]))) +                                                       \
    675     (s64(REGS.RT[i][2]) * s64(V[2]))
    676 
    677   // IR1 = MAC1 = (TRX*1000h + RT11*VX0 + RT12*VY0 + RT13*VZ0) SAR (sf*12)
    678   // IR2 = MAC2 = (TRY*1000h + RT21*VX0 + RT22*VY0 + RT23*VZ0) SAR (sf*12)
    679   // IR3 = MAC3 = (TRZ*1000h + RT31*VX0 + RT32*VY0 + RT33*VZ0) SAR (sf*12)
    680   const s64 x = dot3(0);
    681   const s64 y = dot3(1);
    682   const s64 z = dot3(2);
    683   TruncateAndSetMAC<1>(x, shift);
    684   TruncateAndSetMAC<2>(y, shift);
    685   TruncateAndSetMAC<3>(z, shift);
    686   TruncateAndSetIR<1>(REGS.MAC1, lm);
    687   TruncateAndSetIR<2>(REGS.MAC2, lm);
    688 
    689   // The command does saturate IR1,IR2,IR3 to -8000h..+7FFFh (regardless of lm bit). When using RTP with sf=0, then the
    690   // IR3 saturation flag (FLAG.22) gets set <only> if "MAC3 SAR 12" exceeds -8000h..+7FFFh (although IR3 is saturated
    691   // when "MAC3" exceeds -8000h..+7FFFh).
    692   TruncateAndSetIR<3>(s32(z >> 12), false);
    693   REGS.dr32[11] = std::clamp(REGS.MAC3, lm ? 0 : IR123_MIN_VALUE, IR123_MAX_VALUE);
    694 #undef dot3
    695 
    696   // SZ3 = MAC3 SAR ((1-sf)*12)                           ;ScreenZ FIFO 0..+FFFFh
    697   PushSZ(s32(z >> 12));
    698 
    699   // MAC0=(((H*20000h/SZ3)+1)/2)*IR1+OFX, SX2=MAC0/10000h ;ScrX FIFO -400h..+3FFh
    700   // MAC0=(((H*20000h/SZ3)+1)/2)*IR2+OFY, SY2=MAC0/10000h ;ScrY FIFO -400h..+3FFh
    701   const s64 result = static_cast<s64>(ZeroExtend64(UNRDivide(REGS.H, REGS.SZ3)));
    702 
    703   s64 Sx;
    704   switch (s_config.aspect_ratio)
    705   {
    706     case DisplayAspectRatio::R16_9:
    707       Sx = ((((s64(result) * s64(REGS.IR1)) * s64(3)) / s64(4)) + s64(REGS.OFX));
    708       break;
    709 
    710     case DisplayAspectRatio::R19_9:
    711       Sx = ((((s64(result) * s64(REGS.IR1)) * s64(12)) / s64(19)) + s64(REGS.OFX));
    712       break;
    713 
    714     case DisplayAspectRatio::R20_9:
    715       Sx = ((((s64(result) * s64(REGS.IR1)) * s64(3)) / s64(5)) + s64(REGS.OFX));
    716       break;
    717 
    718     case DisplayAspectRatio::Custom:
    719     case DisplayAspectRatio::MatchWindow:
    720       Sx = ((((s64(result) * s64(REGS.IR1)) * s64(s_config.custom_aspect_ratio_numerator)) /
    721              s64(s_config.custom_aspect_ratio_denominator)) +
    722             s64(REGS.OFX));
    723       break;
    724 
    725     case DisplayAspectRatio::Auto:
    726     case DisplayAspectRatio::R4_3:
    727     case DisplayAspectRatio::PAR1_1:
    728     default:
    729       Sx = (s64(result) * s64(REGS.IR1) + s64(REGS.OFX));
    730       break;
    731   }
    732 
    733   const s64 Sy = s64(result) * s64(REGS.IR2) + s64(REGS.OFY);
    734   CheckMACOverflow<0>(Sx);
    735   CheckMACOverflow<0>(Sy);
    736   PushSXY(s32(Sx >> 16), s32(Sy >> 16));
    737 
    738   if (g_settings.gpu_pgxp_enable)
    739   {
    740     float precise_sz3, precise_ir1, precise_ir2;
    741 
    742     if (g_settings.gpu_pgxp_preserve_proj_fp)
    743     {
    744       precise_sz3 = float(z) / 4096.0f;
    745       precise_ir1 = float(x) / (static_cast<float>(1 << shift));
    746       precise_ir2 = float(y) / (static_cast<float>(1 << shift));
    747       if (lm)
    748       {
    749         precise_ir1 = std::clamp(precise_ir1, float(IR123_MIN_VALUE), float(IR123_MAX_VALUE));
    750         precise_ir2 = std::clamp(precise_ir2, float(IR123_MIN_VALUE), float(IR123_MAX_VALUE));
    751       }
    752       else
    753       {
    754         precise_ir1 = std::min(precise_ir1, float(IR123_MAX_VALUE));
    755         precise_ir2 = std::min(precise_ir2, float(IR123_MAX_VALUE));
    756       }
    757     }
    758     else
    759     {
    760       precise_sz3 = float(REGS.SZ3);
    761       precise_ir1 = float(REGS.IR1);
    762       precise_ir2 = float(REGS.IR2);
    763     }
    764 
    765     // this can potentially use increased precision on Z
    766     const float precise_z = std::max<float>(float(REGS.H) / 2.0f, precise_sz3);
    767     const float precise_h_div_sz = float(REGS.H) / precise_z;
    768     const float fofx = float(REGS.OFX) / float(1 << 16);
    769     const float fofy = float(REGS.OFY) / float(1 << 16);
    770     float precise_x = precise_ir1 * precise_h_div_sz;
    771 
    772     switch (s_config.aspect_ratio)
    773     {
    774       case DisplayAspectRatio::MatchWindow:
    775       case DisplayAspectRatio::Custom:
    776         precise_x = precise_x * s_config.custom_aspect_ratio_f;
    777         break;
    778 
    779       case DisplayAspectRatio::R16_9:
    780         precise_x = (precise_x * 3.0f) / 4.0f;
    781         break;
    782 
    783       case DisplayAspectRatio::R19_9:
    784         precise_x = (precise_x * 12.0f) / 19.0f;
    785         break;
    786 
    787       case DisplayAspectRatio::R20_9:
    788         precise_x = (precise_x * 3.0f) / 5.0f;
    789         break;
    790 
    791       case DisplayAspectRatio::Auto:
    792       case DisplayAspectRatio::R4_3:
    793       case DisplayAspectRatio::PAR1_1:
    794       default:
    795         break;
    796     }
    797 
    798     precise_x += fofx;
    799 
    800     float precise_y = fofy + (precise_ir2 * precise_h_div_sz);
    801 
    802     precise_x = std::clamp<float>(precise_x, -1024.0f, 1023.0f);
    803     precise_y = std::clamp<float>(precise_y, -1024.0f, 1023.0f);
    804     CPU::PGXP::GTE_RTPS(precise_x, precise_y, precise_z, REGS.dr32[14]);
    805   }
    806 
    807   if (last)
    808   {
    809     // MAC0=(((H*20000h/SZ3)+1)/2)*DQA+DQB, IR0=MAC0/1000h  ;Depth cueing 0..+1000h
    810     const s64 Sz = s64(result) * s64(REGS.DQA) + s64(REGS.DQB);
    811     TruncateAndSetMAC<0>(Sz, 0);
    812     TruncateAndSetIR<0>(s32(Sz >> 12), true);
    813   }
    814 }
    815 
    816 void GTE::Execute_RTPS(Instruction inst)
    817 {
    818   REGS.FLAG.Clear();
    819   RTPS(REGS.V0, inst.GetShift(), inst.lm, true);
    820   REGS.FLAG.UpdateError();
    821 }
    822 
    823 void GTE::Execute_RTPT(Instruction inst)
    824 {
    825   REGS.FLAG.Clear();
    826 
    827   const u8 shift = inst.GetShift();
    828   const bool lm = inst.lm;
    829 
    830   RTPS(REGS.V0, shift, lm, false);
    831   RTPS(REGS.V1, shift, lm, false);
    832   RTPS(REGS.V2, shift, lm, true);
    833 
    834   REGS.FLAG.UpdateError();
    835 }
    836 
    837 void GTE::Execute_NCLIP(Instruction inst)
    838 {
    839   // MAC0 =   SX0*SY1 + SX1*SY2 + SX2*SY0 - SX0*SY2 - SX1*SY0 - SX2*SY1
    840   REGS.FLAG.Clear();
    841 
    842   TruncateAndSetMAC<0>(s64(REGS.SXY0[0]) * s64(REGS.SXY1[1]) + s64(REGS.SXY1[0]) * s64(REGS.SXY2[1]) +
    843                          s64(REGS.SXY2[0]) * s64(REGS.SXY0[1]) - s64(REGS.SXY0[0]) * s64(REGS.SXY2[1]) -
    844                          s64(REGS.SXY1[0]) * s64(REGS.SXY0[1]) - s64(REGS.SXY2[0]) * s64(REGS.SXY1[1]),
    845                        0);
    846 
    847   REGS.FLAG.UpdateError();
    848 }
    849 
    850 void GTE::Execute_NCLIP_PGXP(Instruction inst)
    851 {
    852   if (CPU::PGXP::GTE_HasPreciseVertices(REGS.dr32[12], REGS.dr32[13], REGS.dr32[14]))
    853   {
    854     REGS.FLAG.Clear();
    855     REGS.MAC0 = static_cast<s32>(CPU::PGXP::GTE_NCLIP());
    856   }
    857   else
    858   {
    859     Execute_NCLIP(inst);
    860   }
    861 }
    862 
    863 void GTE::Execute_AVSZ3(Instruction inst)
    864 {
    865   REGS.FLAG.Clear();
    866 
    867   const s64 result = s64(REGS.ZSF3) * s32(u32(REGS.SZ1) + u32(REGS.SZ2) + u32(REGS.SZ3));
    868   TruncateAndSetMAC<0>(result, 0);
    869   SetOTZ(s32(result >> 12));
    870 
    871   REGS.FLAG.UpdateError();
    872 }
    873 
    874 void GTE::Execute_AVSZ4(Instruction inst)
    875 {
    876   REGS.FLAG.Clear();
    877 
    878   const s64 result = s64(REGS.ZSF4) * s32(u32(REGS.SZ0) + u32(REGS.SZ1) + u32(REGS.SZ2) + u32(REGS.SZ3));
    879   TruncateAndSetMAC<0>(result, 0);
    880   SetOTZ(s32(result >> 12));
    881 
    882   REGS.FLAG.UpdateError();
    883 }
    884 
    885 ALWAYS_INLINE void GTE::InterpolateColor(s64 in_MAC1, s64 in_MAC2, s64 in_MAC3, u8 shift, bool lm)
    886 {
    887   // [MAC1,MAC2,MAC3] = MAC+(FC-MAC)*IR0
    888   //   [IR1,IR2,IR3] = (([RFC,GFC,BFC] SHL 12) - [MAC1,MAC2,MAC3]) SAR (sf*12)
    889   TruncateAndSetMACAndIR<1>((s64(REGS.FC[0]) << 12) - in_MAC1, shift, false);
    890   TruncateAndSetMACAndIR<2>((s64(REGS.FC[1]) << 12) - in_MAC2, shift, false);
    891   TruncateAndSetMACAndIR<3>((s64(REGS.FC[2]) << 12) - in_MAC3, shift, false);
    892 
    893   //   [MAC1,MAC2,MAC3] = (([IR1,IR2,IR3] * IR0) + [MAC1,MAC2,MAC3])
    894   // [MAC1,MAC2,MAC3] = [MAC1,MAC2,MAC3] SAR (sf*12)
    895   TruncateAndSetMACAndIR<1>(s64(s32(REGS.IR1) * s32(REGS.IR0)) + in_MAC1, shift, lm);
    896   TruncateAndSetMACAndIR<2>(s64(s32(REGS.IR2) * s32(REGS.IR0)) + in_MAC2, shift, lm);
    897   TruncateAndSetMACAndIR<3>(s64(s32(REGS.IR3) * s32(REGS.IR0)) + in_MAC3, shift, lm);
    898 }
    899 
    900 void GTE::NCS(const s16 V[3], u8 shift, bool lm)
    901 {
    902   // [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] = (LLM*V0) SAR (sf*12)
    903   MulMatVec(&REGS.LLM[0][0], V[0], V[1], V[2], shift, lm);
    904 
    905   // [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] = (BK*1000h + LCM*IR) SAR (sf*12)
    906   MulMatVec(&REGS.LCM[0][0], REGS.BK, REGS.IR1, REGS.IR2, REGS.IR3, shift, lm);
    907 
    908   // Color FIFO = [MAC1/16,MAC2/16,MAC3/16,CODE], [IR1,IR2,IR3] = [MAC1,MAC2,MAC3]
    909   PushRGBFromMAC();
    910 }
    911 
    912 void GTE::Execute_NCS(Instruction inst)
    913 {
    914   REGS.FLAG.Clear();
    915 
    916   NCS(REGS.V0, inst.GetShift(), inst.lm);
    917 
    918   REGS.FLAG.UpdateError();
    919 }
    920 
    921 void GTE::Execute_NCT(Instruction inst)
    922 {
    923   REGS.FLAG.Clear();
    924 
    925   const u8 shift = inst.GetShift();
    926   const bool lm = inst.lm;
    927 
    928   NCS(REGS.V0, shift, lm);
    929   NCS(REGS.V1, shift, lm);
    930   NCS(REGS.V2, shift, lm);
    931 
    932   REGS.FLAG.UpdateError();
    933 }
    934 
    935 void GTE::NCCS(const s16 V[3], u8 shift, bool lm)
    936 {
    937   // [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] = (LLM*V0) SAR (sf*12)
    938   MulMatVec(&REGS.LLM[0][0], V[0], V[1], V[2], shift, lm);
    939 
    940   // [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] = (BK*1000h + LCM*IR) SAR (sf*12)
    941   MulMatVec(&REGS.LCM[0][0], REGS.BK, REGS.IR1, REGS.IR2, REGS.IR3, shift, lm);
    942 
    943   // [MAC1,MAC2,MAC3] = [R*IR1,G*IR2,B*IR3] SHL 4          ;<--- for NCDx/NCCx
    944   // [MAC1,MAC2,MAC3] = [MAC1,MAC2,MAC3] SAR (sf*12)       ;<--- for NCDx/NCCx
    945   TruncateAndSetMACAndIR<1>(s64(s32(ZeroExtend32(REGS.RGBC[0])) * s32(REGS.IR1)) << 4, shift, lm);
    946   TruncateAndSetMACAndIR<2>(s64(s32(ZeroExtend32(REGS.RGBC[1])) * s32(REGS.IR2)) << 4, shift, lm);
    947   TruncateAndSetMACAndIR<3>(s64(s32(ZeroExtend32(REGS.RGBC[2])) * s32(REGS.IR3)) << 4, shift, lm);
    948 
    949   // Color FIFO = [MAC1/16,MAC2/16,MAC3/16,CODE], [IR1,IR2,IR3] = [MAC1,MAC2,MAC3]
    950   PushRGBFromMAC();
    951 }
    952 
    953 void GTE::Execute_NCCS(Instruction inst)
    954 {
    955   REGS.FLAG.Clear();
    956 
    957   NCCS(REGS.V0, inst.GetShift(), inst.lm);
    958 
    959   REGS.FLAG.UpdateError();
    960 }
    961 
    962 void GTE::Execute_NCCT(Instruction inst)
    963 {
    964   REGS.FLAG.Clear();
    965 
    966   const u8 shift = inst.GetShift();
    967   const bool lm = inst.lm;
    968 
    969   NCCS(REGS.V0, shift, lm);
    970   NCCS(REGS.V1, shift, lm);
    971   NCCS(REGS.V2, shift, lm);
    972 
    973   REGS.FLAG.UpdateError();
    974 }
    975 
    976 void GTE::NCDS(const s16 V[3], u8 shift, bool lm)
    977 {
    978   // [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] = (LLM*V0) SAR (sf*12)
    979   MulMatVec(&REGS.LLM[0][0], V[0], V[1], V[2], shift, lm);
    980 
    981   // [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] = (BK*1000h + LCM*IR) SAR (sf*12)
    982   MulMatVec(&REGS.LCM[0][0], REGS.BK, REGS.IR1, REGS.IR2, REGS.IR3, shift, lm);
    983 
    984   // No need to assign these to MAC[1-3], as it'll never overflow.
    985   // [MAC1,MAC2,MAC3] = [R*IR1,G*IR2,B*IR3] SHL 4          ;<--- for NCDx/NCCx
    986   const s32 in_MAC1 = (s32(ZeroExtend32(REGS.RGBC[0])) * s32(REGS.IR1)) << 4;
    987   const s32 in_MAC2 = (s32(ZeroExtend32(REGS.RGBC[1])) * s32(REGS.IR2)) << 4;
    988   const s32 in_MAC3 = (s32(ZeroExtend32(REGS.RGBC[2])) * s32(REGS.IR3)) << 4;
    989 
    990   // [MAC1,MAC2,MAC3] = MAC+(FC-MAC)*IR0                   ;<--- for NCDx only
    991   InterpolateColor(in_MAC1, in_MAC2, in_MAC3, shift, lm);
    992 
    993   // Color FIFO = [MAC1/16,MAC2/16,MAC3/16,CODE], [IR1,IR2,IR3] = [MAC1,MAC2,MAC3]
    994   PushRGBFromMAC();
    995 }
    996 
    997 void GTE::Execute_NCDS(Instruction inst)
    998 {
    999   REGS.FLAG.Clear();
   1000 
   1001   NCDS(REGS.V0, inst.GetShift(), inst.lm);
   1002 
   1003   REGS.FLAG.UpdateError();
   1004 }
   1005 
   1006 void GTE::Execute_NCDT(Instruction inst)
   1007 {
   1008   REGS.FLAG.Clear();
   1009 
   1010   const u8 shift = inst.GetShift();
   1011   const bool lm = inst.lm;
   1012 
   1013   NCDS(REGS.V0, shift, lm);
   1014   NCDS(REGS.V1, shift, lm);
   1015   NCDS(REGS.V2, shift, lm);
   1016 
   1017   REGS.FLAG.UpdateError();
   1018 }
   1019 
   1020 void GTE::Execute_CC(Instruction inst)
   1021 {
   1022   REGS.FLAG.Clear();
   1023 
   1024   const u8 shift = inst.GetShift();
   1025   const bool lm = inst.lm;
   1026 
   1027   // [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] = (BK*1000h + LCM*IR) SAR (sf*12)
   1028   MulMatVec(&REGS.LCM[0][0], REGS.BK, REGS.IR1, REGS.IR2, REGS.IR3, shift, lm);
   1029 
   1030   // [MAC1,MAC2,MAC3] = [R*IR1,G*IR2,B*IR3] SHL 4
   1031   // [MAC1,MAC2,MAC3] = [MAC1,MAC2,MAC3] SAR (sf*12)
   1032   TruncateAndSetMACAndIR<1>(s64(s32(ZeroExtend32(REGS.RGBC[0])) * s32(REGS.IR1)) << 4, shift, lm);
   1033   TruncateAndSetMACAndIR<2>(s64(s32(ZeroExtend32(REGS.RGBC[1])) * s32(REGS.IR2)) << 4, shift, lm);
   1034   TruncateAndSetMACAndIR<3>(s64(s32(ZeroExtend32(REGS.RGBC[2])) * s32(REGS.IR3)) << 4, shift, lm);
   1035 
   1036   // Color FIFO = [MAC1/16,MAC2/16,MAC3/16,CODE], [IR1,IR2,IR3] = [MAC1,MAC2,MAC3]
   1037   PushRGBFromMAC();
   1038 
   1039   REGS.FLAG.UpdateError();
   1040 }
   1041 
   1042 void GTE::Execute_CDP(Instruction inst)
   1043 {
   1044   REGS.FLAG.Clear();
   1045 
   1046   const u8 shift = inst.GetShift();
   1047   const bool lm = inst.lm;
   1048 
   1049   // [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] = (BK*1000h + LCM*IR) SAR (sf*12)
   1050   MulMatVec(&REGS.LCM[0][0], REGS.BK, REGS.IR1, REGS.IR2, REGS.IR3, shift, lm);
   1051 
   1052   // No need to assign these to MAC[1-3], as it'll never overflow.
   1053   // [MAC1,MAC2,MAC3] = [R*IR1,G*IR2,B*IR3] SHL 4
   1054   const s32 in_MAC1 = (s32(ZeroExtend32(REGS.RGBC[0])) * s32(REGS.IR1)) << 4;
   1055   const s32 in_MAC2 = (s32(ZeroExtend32(REGS.RGBC[1])) * s32(REGS.IR2)) << 4;
   1056   const s32 in_MAC3 = (s32(ZeroExtend32(REGS.RGBC[2])) * s32(REGS.IR3)) << 4;
   1057 
   1058   // [MAC1,MAC2,MAC3] = MAC+(FC-MAC)*IR0                   ;<--- for CDP only
   1059   // [MAC1, MAC2, MAC3] = [MAC1, MAC2, MAC3] SAR(sf * 12)
   1060   InterpolateColor(in_MAC1, in_MAC2, in_MAC3, shift, lm);
   1061 
   1062   // Color FIFO = [MAC1/16,MAC2/16,MAC3/16,CODE], [IR1,IR2,IR3] = [MAC1,MAC2,MAC3]
   1063   PushRGBFromMAC();
   1064 
   1065   REGS.FLAG.UpdateError();
   1066 }
   1067 
   1068 void GTE::DPCS(const u8 color[3], u8 shift, bool lm)
   1069 {
   1070   // In: [IR1,IR2,IR3]=Vector, FC=Far Color, IR0=Interpolation value, CODE=MSB of RGBC
   1071   // [MAC1,MAC2,MAC3] = [R,G,B] SHL 16                     ;<--- for DPCS/DPCT
   1072   TruncateAndSetMAC<1>((s64(ZeroExtend64(color[0])) << 16), 0);
   1073   TruncateAndSetMAC<2>((s64(ZeroExtend64(color[1])) << 16), 0);
   1074   TruncateAndSetMAC<3>((s64(ZeroExtend64(color[2])) << 16), 0);
   1075 
   1076   // [MAC1,MAC2,MAC3] = MAC+(FC-MAC)*IR0
   1077   InterpolateColor(REGS.MAC1, REGS.MAC2, REGS.MAC3, shift, lm);
   1078 
   1079   // Color FIFO = [MAC1/16,MAC2/16,MAC3/16,CODE], [IR1,IR2,IR3] = [MAC1,MAC2,MAC3]
   1080   PushRGBFromMAC();
   1081 }
   1082 
   1083 void GTE::Execute_DPCS(Instruction inst)
   1084 {
   1085   REGS.FLAG.Clear();
   1086 
   1087   DPCS(REGS.RGBC, inst.GetShift(), inst.lm);
   1088 
   1089   REGS.FLAG.UpdateError();
   1090 }
   1091 
   1092 void GTE::Execute_DPCT(Instruction inst)
   1093 {
   1094   REGS.FLAG.Clear();
   1095 
   1096   const u8 shift = inst.GetShift();
   1097   const bool lm = inst.lm;
   1098 
   1099   for (u32 i = 0; i < 3; i++)
   1100     DPCS(REGS.RGB0, shift, lm);
   1101 
   1102   REGS.FLAG.UpdateError();
   1103 }
   1104 
   1105 void GTE::Execute_DCPL(Instruction inst)
   1106 {
   1107   REGS.FLAG.Clear();
   1108 
   1109   const u8 shift = inst.GetShift();
   1110   const bool lm = inst.lm;
   1111 
   1112   // No need to assign these to MAC[1-3], as it'll never overflow.
   1113   // [MAC1,MAC2,MAC3] = [R*IR1,G*IR2,B*IR3] SHL 4          ;<--- for DCPL only
   1114   const s32 in_MAC1 = (s32(ZeroExtend32(REGS.RGBC[0])) * s32(REGS.IR1)) << 4;
   1115   const s32 in_MAC2 = (s32(ZeroExtend32(REGS.RGBC[1])) * s32(REGS.IR2)) << 4;
   1116   const s32 in_MAC3 = (s32(ZeroExtend32(REGS.RGBC[2])) * s32(REGS.IR3)) << 4;
   1117 
   1118   // [MAC1,MAC2,MAC3] = MAC+(FC-MAC)*IR0
   1119   InterpolateColor(in_MAC1, in_MAC2, in_MAC3, shift, lm);
   1120 
   1121   // Color FIFO = [MAC1/16,MAC2/16,MAC3/16,CODE], [IR1,IR2,IR3] = [MAC1,MAC2,MAC3]
   1122   PushRGBFromMAC();
   1123 
   1124   REGS.FLAG.UpdateError();
   1125 }
   1126 
   1127 void GTE::Execute_INTPL(Instruction inst)
   1128 {
   1129   REGS.FLAG.Clear();
   1130 
   1131   const u8 shift = inst.GetShift();
   1132   const bool lm = inst.lm;
   1133 
   1134   // No need to assign these to MAC[1-3], as it'll never overflow.
   1135   // [MAC1,MAC2,MAC3] = [IR1,IR2,IR3] SHL 12               ;<--- for INTPL only
   1136   // [MAC1,MAC2,MAC3] = MAC+(FC-MAC)*IR0
   1137   InterpolateColor(s32(REGS.IR1) << 12, s32(REGS.IR2) << 12, s32(REGS.IR3) << 12, shift, lm);
   1138 
   1139   // Color FIFO = [MAC1/16,MAC2/16,MAC3/16,CODE], [IR1,IR2,IR3] = [MAC1,MAC2,MAC3]
   1140   PushRGBFromMAC();
   1141 
   1142   REGS.FLAG.UpdateError();
   1143 }
   1144 
   1145 void GTE::Execute_GPL(Instruction inst)
   1146 {
   1147   REGS.FLAG.Clear();
   1148 
   1149   const u8 shift = inst.GetShift();
   1150   const bool lm = inst.lm;
   1151 
   1152   // [MAC1,MAC2,MAC3] = [MAC1,MAC2,MAC3] SHL (sf*12)       ;<--- for GPL only
   1153   // [MAC1,MAC2,MAC3] = (([IR1,IR2,IR3] * IR0) + [MAC1,MAC2,MAC3]) SAR (sf*12)
   1154   TruncateAndSetMACAndIR<1>((s64(s32(REGS.IR1) * s32(REGS.IR0)) + (s64(REGS.MAC1) << shift)), shift, lm);
   1155   TruncateAndSetMACAndIR<2>((s64(s32(REGS.IR2) * s32(REGS.IR0)) + (s64(REGS.MAC2) << shift)), shift, lm);
   1156   TruncateAndSetMACAndIR<3>((s64(s32(REGS.IR3) * s32(REGS.IR0)) + (s64(REGS.MAC3) << shift)), shift, lm);
   1157 
   1158   // Color FIFO = [MAC1/16,MAC2/16,MAC3/16,CODE], [IR1,IR2,IR3] = [MAC1,MAC2,MAC3]
   1159   PushRGBFromMAC();
   1160 
   1161   REGS.FLAG.UpdateError();
   1162 }
   1163 
   1164 void GTE::Execute_GPF(Instruction inst)
   1165 {
   1166   REGS.FLAG.Clear();
   1167 
   1168   const u8 shift = inst.GetShift();
   1169   const bool lm = inst.lm;
   1170 
   1171   // [MAC1,MAC2,MAC3] = [0,0,0]                            ;<--- for GPF only
   1172   // [MAC1,MAC2,MAC3] = (([IR1,IR2,IR3] * IR0) + [MAC1,MAC2,MAC3]) SAR (sf*12)
   1173   TruncateAndSetMACAndIR<1>(s64(s32(REGS.IR1) * s32(REGS.IR0)), shift, lm);
   1174   TruncateAndSetMACAndIR<2>(s64(s32(REGS.IR2) * s32(REGS.IR0)), shift, lm);
   1175   TruncateAndSetMACAndIR<3>(s64(s32(REGS.IR3) * s32(REGS.IR0)), shift, lm);
   1176 
   1177   // Color FIFO = [MAC1/16,MAC2/16,MAC3/16,CODE], [IR1,IR2,IR3] = [MAC1,MAC2,MAC3]
   1178   PushRGBFromMAC();
   1179 
   1180   REGS.FLAG.UpdateError();
   1181 }
   1182 
   1183 void GTE::ExecuteInstruction(u32 inst_bits)
   1184 {
   1185   const Instruction inst{inst_bits};
   1186   switch (inst.command)
   1187   {
   1188     case 0x01:
   1189       CPU::AddGTETicks(15);
   1190       Execute_RTPS(inst);
   1191       break;
   1192 
   1193     case 0x06:
   1194     {
   1195       CPU::AddGTETicks(8);
   1196       if (g_settings.gpu_pgxp_enable && g_settings.gpu_pgxp_culling)
   1197         Execute_NCLIP_PGXP(inst);
   1198       else
   1199         Execute_NCLIP(inst);
   1200     }
   1201     break;
   1202 
   1203     case 0x0C:
   1204       CPU::AddGTETicks(6);
   1205       Execute_OP(inst);
   1206       break;
   1207 
   1208     case 0x10:
   1209       CPU::AddGTETicks(8);
   1210       Execute_DPCS(inst);
   1211       break;
   1212 
   1213     case 0x11:
   1214       CPU::AddGTETicks(7);
   1215       Execute_INTPL(inst);
   1216       break;
   1217 
   1218     case 0x12:
   1219       CPU::AddGTETicks(8);
   1220       Execute_MVMVA(inst);
   1221       break;
   1222 
   1223     case 0x13:
   1224       CPU::AddGTETicks(19);
   1225       Execute_NCDS(inst);
   1226       break;
   1227 
   1228     case 0x14:
   1229       CPU::AddGTETicks(13);
   1230       Execute_CDP(inst);
   1231       break;
   1232 
   1233     case 0x16:
   1234       CPU::AddGTETicks(44);
   1235       Execute_NCDT(inst);
   1236       break;
   1237 
   1238     case 0x1B:
   1239       CPU::AddGTETicks(17);
   1240       Execute_NCCS(inst);
   1241       break;
   1242 
   1243     case 0x1C:
   1244       CPU::AddGTETicks(11);
   1245       Execute_CC(inst);
   1246       break;
   1247 
   1248     case 0x1E:
   1249       CPU::AddGTETicks(14);
   1250       Execute_NCS(inst);
   1251       break;
   1252 
   1253     case 0x20:
   1254       CPU::AddGTETicks(30);
   1255       Execute_NCT(inst);
   1256       break;
   1257 
   1258     case 0x28:
   1259       CPU::AddGTETicks(5);
   1260       Execute_SQR(inst);
   1261       break;
   1262 
   1263     case 0x29:
   1264       CPU::AddGTETicks(8);
   1265       Execute_DCPL(inst);
   1266       break;
   1267 
   1268     case 0x2A:
   1269       CPU::AddGTETicks(17);
   1270       Execute_DPCT(inst);
   1271       break;
   1272 
   1273     case 0x2D:
   1274       CPU::AddGTETicks(5);
   1275       Execute_AVSZ3(inst);
   1276       break;
   1277 
   1278     case 0x2E:
   1279       CPU::AddGTETicks(6);
   1280       Execute_AVSZ4(inst);
   1281       break;
   1282 
   1283     case 0x30:
   1284       CPU::AddGTETicks(23);
   1285       Execute_RTPT(inst);
   1286       break;
   1287 
   1288     case 0x3D:
   1289       CPU::AddGTETicks(5);
   1290       Execute_GPF(inst);
   1291       break;
   1292 
   1293     case 0x3E:
   1294       CPU::AddGTETicks(5);
   1295       Execute_GPL(inst);
   1296       break;
   1297 
   1298     case 0x3F:
   1299       CPU::AddGTETicks(39);
   1300       Execute_NCCT(inst);
   1301       break;
   1302 
   1303     default:
   1304       Panic("Missing handler");
   1305       break;
   1306   }
   1307 }
   1308 
   1309 GTE::InstructionImpl GTE::GetInstructionImpl(u32 inst_bits, TickCount* ticks)
   1310 {
   1311   const Instruction inst{inst_bits};
   1312   switch (inst.command)
   1313   {
   1314     case 0x01:
   1315       *ticks = 15;
   1316       return &Execute_RTPS;
   1317 
   1318     case 0x06:
   1319     {
   1320       *ticks = 8;
   1321       if (g_settings.gpu_pgxp_enable && g_settings.gpu_pgxp_culling)
   1322         return &Execute_NCLIP_PGXP;
   1323       else
   1324         return &Execute_NCLIP;
   1325     }
   1326 
   1327     case 0x0C:
   1328       *ticks = 6;
   1329       return &Execute_OP;
   1330 
   1331     case 0x10:
   1332       *ticks = 8;
   1333       return &Execute_DPCS;
   1334 
   1335     case 0x11:
   1336       *ticks = 7;
   1337       return &Execute_INTPL;
   1338 
   1339     case 0x12:
   1340       *ticks = 8;
   1341       return &Execute_MVMVA;
   1342 
   1343     case 0x13:
   1344       *ticks = 19;
   1345       return &Execute_NCDS;
   1346 
   1347     case 0x14:
   1348       *ticks = 13;
   1349       return &Execute_CDP;
   1350 
   1351     case 0x16:
   1352       *ticks = 44;
   1353       return &Execute_NCDT;
   1354 
   1355     case 0x1B:
   1356       *ticks = 17;
   1357       return &Execute_NCCS;
   1358 
   1359     case 0x1C:
   1360       *ticks = 11;
   1361       return &Execute_CC;
   1362 
   1363     case 0x1E:
   1364       *ticks = 14;
   1365       return &Execute_NCS;
   1366 
   1367     case 0x20:
   1368       *ticks = 30;
   1369       return &Execute_NCT;
   1370 
   1371     case 0x28:
   1372       *ticks = 5;
   1373       return &Execute_SQR;
   1374 
   1375     case 0x29:
   1376       *ticks = 8;
   1377       return &Execute_DCPL;
   1378 
   1379     case 0x2A:
   1380       *ticks = 17;
   1381       return &Execute_DPCT;
   1382 
   1383     case 0x2D:
   1384       *ticks = 5;
   1385       return &Execute_AVSZ3;
   1386 
   1387     case 0x2E:
   1388       *ticks = 6;
   1389       return &Execute_AVSZ4;
   1390 
   1391     case 0x30:
   1392       *ticks = 23;
   1393       return &Execute_RTPT;
   1394 
   1395     case 0x3D:
   1396       *ticks = 5;
   1397       return &Execute_GPF;
   1398 
   1399     case 0x3E:
   1400       *ticks = 5;
   1401       return &Execute_GPL;
   1402 
   1403     case 0x3F:
   1404       *ticks = 39;
   1405       return &Execute_NCCT;
   1406 
   1407     default:
   1408       Panic("Missing handler");
   1409   }
   1410 }