gte.cpp (41205B)
1 // SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin <stenzek@gmail.com> 2 // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) 3 4 #include "gte.h" 5 6 #include "cpu_core.h" 7 #include "cpu_core_private.h" 8 #include "cpu_pgxp.h" 9 #include "settings.h" 10 11 #include "util/gpu_device.h" 12 #include "util/state_wrapper.h" 13 14 #include "common/assert.h" 15 #include "common/bitutils.h" 16 17 #include <algorithm> 18 #include <array> 19 #include <numeric> 20 21 namespace GTE { 22 23 static constexpr s64 MAC0_MIN_VALUE = -(INT64_C(1) << 31); 24 static constexpr s64 MAC0_MAX_VALUE = (INT64_C(1) << 31) - 1; 25 static constexpr s64 MAC123_MIN_VALUE = -(INT64_C(1) << 43); 26 static constexpr s64 MAC123_MAX_VALUE = (INT64_C(1) << 43) - 1; 27 static constexpr s32 IR0_MIN_VALUE = 0x0000; 28 static constexpr s32 IR0_MAX_VALUE = 0x1000; 29 static constexpr s32 IR123_MIN_VALUE = -(INT64_C(1) << 15); 30 static constexpr s32 IR123_MAX_VALUE = (INT64_C(1) << 15) - 1; 31 32 namespace { 33 struct Config 34 { 35 DisplayAspectRatio aspect_ratio = DisplayAspectRatio::R4_3; 36 u32 custom_aspect_ratio_numerator; 37 u32 custom_aspect_ratio_denominator; 38 float custom_aspect_ratio_f; 39 }; 40 } // namespace 41 42 ALIGN_TO_CACHE_LINE static Config s_config; 43 44 #define REGS CPU::g_state.gte_regs 45 46 ALWAYS_INLINE static u32 CountLeadingBits(u32 value) 47 { 48 // if top-most bit is set, we want to count ones not zeros 49 if (value & UINT32_C(0x80000000)) 50 value ^= UINT32_C(0xFFFFFFFF); 51 52 return (value == 0u) ? 32 : CountLeadingZeros(value); 53 } 54 55 template<u32 index> 56 ALWAYS_INLINE static void CheckMACOverflow(s64 value) 57 { 58 constexpr s64 MIN_VALUE = (index == 0) ? MAC0_MIN_VALUE : MAC123_MIN_VALUE; 59 constexpr s64 MAX_VALUE = (index == 0) ? MAC0_MAX_VALUE : MAC123_MAX_VALUE; 60 if (value < MIN_VALUE) 61 { 62 if constexpr (index == 0) 63 REGS.FLAG.mac0_underflow = true; 64 else if constexpr (index == 1) 65 REGS.FLAG.mac1_underflow = true; 66 else if constexpr (index == 2) 67 REGS.FLAG.mac2_underflow = true; 68 else if constexpr (index == 3) 69 REGS.FLAG.mac3_underflow = true; 70 } 71 else if (value > MAX_VALUE) 72 { 73 if constexpr (index == 0) 74 REGS.FLAG.mac0_overflow = true; 75 else if constexpr (index == 1) 76 REGS.FLAG.mac1_overflow = true; 77 else if constexpr (index == 2) 78 REGS.FLAG.mac2_overflow = true; 79 else if constexpr (index == 3) 80 REGS.FLAG.mac3_overflow = true; 81 } 82 } 83 84 template<u32 index> 85 ALWAYS_INLINE static s64 SignExtendMACResult(s64 value) 86 { 87 CheckMACOverflow<index>(value); 88 return SignExtendN < index == 0 ? 31 : 44 > (value); 89 } 90 91 template<u32 index> 92 ALWAYS_INLINE static void TruncateAndSetMAC(s64 value, u8 shift) 93 { 94 CheckMACOverflow<index>(value); 95 96 // shift should be done before storing to avoid losing precision 97 value >>= shift; 98 99 REGS.dr32[24 + index] = Truncate32(static_cast<u64>(value)); 100 } 101 102 template<u32 index> 103 ALWAYS_INLINE static void TruncateAndSetIR(s32 value, bool lm) 104 { 105 constexpr s32 MIN_VALUE = (index == 0) ? IR0_MIN_VALUE : IR123_MIN_VALUE; 106 constexpr s32 MAX_VALUE = (index == 0) ? IR0_MAX_VALUE : IR123_MAX_VALUE; 107 const s32 actual_min_value = lm ? 0 : MIN_VALUE; 108 if (value < actual_min_value) 109 { 110 value = actual_min_value; 111 if constexpr (index == 0) 112 REGS.FLAG.ir0_saturated = true; 113 else if constexpr (index == 1) 114 REGS.FLAG.ir1_saturated = true; 115 else if constexpr (index == 2) 116 REGS.FLAG.ir2_saturated = true; 117 else if constexpr (index == 3) 118 REGS.FLAG.ir3_saturated = true; 119 } 120 else if (value > MAX_VALUE) 121 { 122 value = MAX_VALUE; 123 if constexpr (index == 0) 124 REGS.FLAG.ir0_saturated = true; 125 else if constexpr (index == 1) 126 REGS.FLAG.ir1_saturated = true; 127 else if constexpr (index == 2) 128 REGS.FLAG.ir2_saturated = true; 129 else if constexpr (index == 3) 130 REGS.FLAG.ir3_saturated = true; 131 } 132 133 // store sign-extended 16-bit value as 32-bit 134 REGS.dr32[8 + index] = value; 135 } 136 137 template<u32 index> 138 ALWAYS_INLINE static void TruncateAndSetMACAndIR(s64 value, u8 shift, bool lm) 139 { 140 CheckMACOverflow<index>(value); 141 142 // shift should be done before storing to avoid losing precision 143 value >>= shift; 144 145 // set MAC 146 const s32 value32 = static_cast<s32>(value); 147 REGS.dr32[24 + index] = value32; 148 149 // set IR 150 TruncateAndSetIR<index>(value32, lm); 151 } 152 153 template<u32 index> 154 ALWAYS_INLINE static u32 TruncateRGB(s32 value) 155 { 156 if (value < 0 || value > 0xFF) 157 { 158 if constexpr (index == 0) 159 REGS.FLAG.color_r_saturated = true; 160 else if constexpr (index == 1) 161 REGS.FLAG.color_g_saturated = true; 162 else 163 REGS.FLAG.color_b_saturated = true; 164 165 return (value < 0) ? 0 : 0xFF; 166 } 167 168 return static_cast<u32>(value); 169 } 170 171 static void SetOTZ(s32 value); 172 static void PushSXY(s32 x, s32 y); 173 static void PushSZ(s32 value); 174 static void PushRGBFromMAC(); 175 static u32 UNRDivide(u32 lhs, u32 rhs); 176 177 static void MulMatVec(const s16* M_, const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm); 178 static void MulMatVec(const s16* M_, const s32 T[3], const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm); 179 static void MulMatVecBuggy(const s16* M_, const s32 T[3], const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm); 180 181 static void InterpolateColor(s64 in_MAC1, s64 in_MAC2, s64 in_MAC3, u8 shift, bool lm); 182 static void RTPS(const s16 V[3], u8 shift, bool lm, bool last); 183 static void NCS(const s16 V[3], u8 shift, bool lm); 184 static void NCCS(const s16 V[3], u8 shift, bool lm); 185 static void NCDS(const s16 V[3], u8 shift, bool lm); 186 static void DPCS(const u8 color[3], u8 shift, bool lm); 187 188 static void Execute_MVMVA(Instruction inst); 189 static void Execute_SQR(Instruction inst); 190 static void Execute_OP(Instruction inst); 191 static void Execute_RTPS(Instruction inst); 192 static void Execute_RTPT(Instruction inst); 193 static void Execute_NCLIP(Instruction inst); 194 static void Execute_NCLIP_PGXP(Instruction inst); 195 static void Execute_AVSZ3(Instruction inst); 196 static void Execute_AVSZ4(Instruction inst); 197 static void Execute_NCS(Instruction inst); 198 static void Execute_NCT(Instruction inst); 199 static void Execute_NCCS(Instruction inst); 200 static void Execute_NCCT(Instruction inst); 201 static void Execute_NCDS(Instruction inst); 202 static void Execute_NCDT(Instruction inst); 203 static void Execute_CC(Instruction inst); 204 static void Execute_CDP(Instruction inst); 205 static void Execute_DPCS(Instruction inst); 206 static void Execute_DPCT(Instruction inst); 207 static void Execute_DCPL(Instruction inst); 208 static void Execute_INTPL(Instruction inst); 209 static void Execute_GPL(Instruction inst); 210 static void Execute_GPF(Instruction inst); 211 212 } // namespace GTE 213 214 void GTE::Initialize() 215 { 216 s_config.aspect_ratio = DisplayAspectRatio::R4_3; 217 Reset(); 218 } 219 220 void GTE::Reset() 221 { 222 std::memset(®S, 0, sizeof(REGS)); 223 } 224 225 bool GTE::DoState(StateWrapper& sw) 226 { 227 sw.DoArray(REGS.r32, NUM_DATA_REGS + NUM_CONTROL_REGS); 228 return !sw.HasError(); 229 } 230 231 void GTE::UpdateAspectRatio() 232 { 233 if (!g_settings.gpu_widescreen_hack) 234 { 235 s_config.aspect_ratio = DisplayAspectRatio::R4_3; 236 return; 237 } 238 239 s_config.aspect_ratio = g_settings.display_aspect_ratio; 240 241 u32 num, denom; 242 switch (s_config.aspect_ratio) 243 { 244 case DisplayAspectRatio::MatchWindow: 245 { 246 if (!g_gpu_device) 247 { 248 s_config.aspect_ratio = DisplayAspectRatio::R4_3; 249 return; 250 } 251 252 num = g_gpu_device->GetWindowWidth(); 253 denom = g_gpu_device->GetWindowHeight(); 254 } 255 break; 256 257 case DisplayAspectRatio::Custom: 258 { 259 num = g_settings.display_aspect_ratio_custom_numerator; 260 denom = g_settings.display_aspect_ratio_custom_denominator; 261 } 262 break; 263 264 default: 265 return; 266 } 267 268 // (4 / 3) / (num / denom) => gcd((4 * denom) / (3 * num)) 269 const u32 x = 4u * denom; 270 const u32 y = 3u * num; 271 const u32 gcd = std::gcd(x, y); 272 273 s_config.custom_aspect_ratio_numerator = x / gcd; 274 s_config.custom_aspect_ratio_denominator = y / gcd; 275 276 s_config.custom_aspect_ratio_f = 277 static_cast<float>((4.0 / 3.0) / (static_cast<double>(num) / static_cast<double>(denom))); 278 } 279 280 u32 GTE::ReadRegister(u32 index) 281 { 282 DebugAssert(index < countof(REGS.r32)); 283 284 switch (index) 285 { 286 case 15: // SXY3 287 { 288 // mirror of SXY2 289 return REGS.r32[14]; 290 } 291 292 case 28: // IRGB 293 case 29: // ORGB 294 { 295 // ORGB register, convert 16-bit to 555 296 const u8 r = static_cast<u8>(std::clamp(REGS.IR1 / 0x80, 0x00, 0x1F)); 297 const u8 g = static_cast<u8>(std::clamp(REGS.IR2 / 0x80, 0x00, 0x1F)); 298 const u8 b = static_cast<u8>(std::clamp(REGS.IR3 / 0x80, 0x00, 0x1F)); 299 return ZeroExtend32(r) | (ZeroExtend32(g) << 5) | (ZeroExtend32(b) << 10); 300 } 301 302 default: 303 return REGS.r32[index]; 304 } 305 } 306 307 void GTE::WriteRegister(u32 index, u32 value) 308 { 309 #if 0 310 if (index < 32) 311 { 312 Log_DebugPrintf("DataReg(%u) <- 0x%08X", index, value); 313 } 314 else 315 { 316 Log_DebugPrintf("ControlReg(%u) <- 0x%08X", index, value); 317 } 318 #endif 319 320 switch (index) 321 { 322 case 1: // V0[z] 323 case 3: // V1[z] 324 case 5: // V2[z] 325 case 8: // IR0 326 case 9: // IR1 327 case 10: // IR2 328 case 11: // IR3 329 case 36: // RT33 330 case 44: // L33 331 case 52: // LR33 332 case 58: // H - sign-extended on read but zext on use 333 case 59: // DQA 334 case 61: // ZSF3 335 case 62: // ZSF4 336 { 337 // sign-extend z component of vector registers 338 REGS.r32[index] = SignExtend32(Truncate16(value)); 339 } 340 break; 341 342 case 7: // OTZ 343 case 16: // SZ0 344 case 17: // SZ1 345 case 18: // SZ2 346 case 19: // SZ3 347 { 348 // zero-extend unsigned values 349 REGS.r32[index] = ZeroExtend32(Truncate16(value)); 350 } 351 break; 352 353 case 15: // SXY3 354 { 355 // writing to SXYP pushes to the FIFO 356 REGS.r32[12] = REGS.r32[13]; // SXY0 <- SXY1 357 REGS.r32[13] = REGS.r32[14]; // SXY1 <- SXY2 358 REGS.r32[14] = value; // SXY2 <- SXYP 359 } 360 break; 361 362 case 28: // IRGB 363 { 364 // IRGB register, convert 555 to 16-bit 365 REGS.IRGB = value & UINT32_C(0x7FFF); 366 REGS.r32[9] = SignExtend32(static_cast<u16>(Truncate16((value & UINT32_C(0x1F)) * UINT32_C(0x80)))); 367 REGS.r32[10] = SignExtend32(static_cast<u16>(Truncate16(((value >> 5) & UINT32_C(0x1F)) * UINT32_C(0x80)))); 368 REGS.r32[11] = SignExtend32(static_cast<u16>(Truncate16(((value >> 10) & UINT32_C(0x1F)) * UINT32_C(0x80)))); 369 } 370 break; 371 372 case 30: // LZCS 373 { 374 REGS.LZCS = static_cast<s32>(value); 375 REGS.LZCR = CountLeadingBits(value); 376 } 377 break; 378 379 case 29: // ORGB 380 case 31: // LZCR 381 { 382 // read-only registers 383 } 384 break; 385 386 case 63: // FLAG 387 { 388 REGS.FLAG.bits = value & UINT32_C(0x7FFFF000); 389 REGS.FLAG.UpdateError(); 390 } 391 break; 392 393 default: 394 { 395 // written as-is, 2x16 or 1x32 bits 396 REGS.r32[index] = value; 397 } 398 break; 399 } 400 } 401 402 u32* GTE::GetRegisterPtr(u32 index) 403 { 404 return ®S.r32[index]; 405 } 406 407 ALWAYS_INLINE void GTE::SetOTZ(s32 value) 408 { 409 if (value < 0) 410 { 411 REGS.FLAG.sz1_otz_saturated = true; 412 value = 0; 413 } 414 else if (value > 0xFFFF) 415 { 416 REGS.FLAG.sz1_otz_saturated = true; 417 value = 0xFFFF; 418 } 419 420 REGS.dr32[7] = static_cast<u32>(value); 421 } 422 423 ALWAYS_INLINE void GTE::PushSXY(s32 x, s32 y) 424 { 425 if (x < -1024) 426 { 427 REGS.FLAG.sx2_saturated = true; 428 x = -1024; 429 } 430 else if (x > 1023) 431 { 432 REGS.FLAG.sx2_saturated = true; 433 x = 1023; 434 } 435 436 if (y < -1024) 437 { 438 REGS.FLAG.sy2_saturated = true; 439 y = -1024; 440 } 441 else if (y > 1023) 442 { 443 REGS.FLAG.sy2_saturated = true; 444 y = 1023; 445 } 446 447 REGS.dr32[12] = REGS.dr32[13]; // SXY0 <- SXY1 448 REGS.dr32[13] = REGS.dr32[14]; // SXY1 <- SXY2 449 REGS.dr32[14] = (static_cast<u32>(x) & 0xFFFFu) | (static_cast<u32>(y) << 16); 450 } 451 452 ALWAYS_INLINE void GTE::PushSZ(s32 value) 453 { 454 if (value < 0) 455 { 456 REGS.FLAG.sz1_otz_saturated = true; 457 value = 0; 458 } 459 else if (value > 0xFFFF) 460 { 461 REGS.FLAG.sz1_otz_saturated = true; 462 value = 0xFFFF; 463 } 464 465 REGS.dr32[16] = REGS.dr32[17]; // SZ0 <- SZ1 466 REGS.dr32[17] = REGS.dr32[18]; // SZ1 <- SZ2 467 REGS.dr32[18] = REGS.dr32[19]; // SZ2 <- SZ3 468 REGS.dr32[19] = static_cast<u32>(value); // SZ3 <- value 469 } 470 471 ALWAYS_INLINE void GTE::PushRGBFromMAC() 472 { 473 // Note: SHR 4 used instead of /16 as the results are different. 474 const u32 r = TruncateRGB<0>(static_cast<u32>(REGS.MAC1 >> 4)); 475 const u32 g = TruncateRGB<1>(static_cast<u32>(REGS.MAC2 >> 4)); 476 const u32 b = TruncateRGB<2>(static_cast<u32>(REGS.MAC3 >> 4)); 477 const u32 c = ZeroExtend32(REGS.RGBC[3]); 478 479 REGS.dr32[20] = REGS.dr32[21]; // RGB0 <- RGB1 480 REGS.dr32[21] = REGS.dr32[22]; // RGB1 <- RGB2 481 REGS.dr32[22] = r | (g << 8) | (b << 16) | (c << 24); // RGB2 <- Value 482 } 483 484 ALWAYS_INLINE u32 GTE::UNRDivide(u32 lhs, u32 rhs) 485 { 486 if (rhs * 2 <= lhs) 487 { 488 REGS.FLAG.divide_overflow = true; 489 return 0x1FFFF; 490 } 491 492 const u32 shift = (rhs == 0) ? 16 : CountLeadingZeros(static_cast<u16>(rhs)); 493 lhs <<= shift; 494 rhs <<= shift; 495 496 static constexpr std::array<u8, 257> unr_table = {{ 497 0xFF, 0xFD, 0xFB, 0xF9, 0xF7, 0xF5, 0xF3, 0xF1, 0xEF, 0xEE, 0xEC, 0xEA, 0xE8, 0xE6, 0xE4, 0xE3, // 498 0xE1, 0xDF, 0xDD, 0xDC, 0xDA, 0xD8, 0xD6, 0xD5, 0xD3, 0xD1, 0xD0, 0xCE, 0xCD, 0xCB, 0xC9, 0xC8, // 00h..3Fh 499 0xC6, 0xC5, 0xC3, 0xC1, 0xC0, 0xBE, 0xBD, 0xBB, 0xBA, 0xB8, 0xB7, 0xB5, 0xB4, 0xB2, 0xB1, 0xB0, // 500 0xAE, 0xAD, 0xAB, 0xAA, 0xA9, 0xA7, 0xA6, 0xA4, 0xA3, 0xA2, 0xA0, 0x9F, 0x9E, 0x9C, 0x9B, 0x9A, // 501 0x99, 0x97, 0x96, 0x95, 0x94, 0x92, 0x91, 0x90, 0x8F, 0x8D, 0x8C, 0x8B, 0x8A, 0x89, 0x87, 0x86, // 502 0x85, 0x84, 0x83, 0x82, 0x81, 0x7F, 0x7E, 0x7D, 0x7C, 0x7B, 0x7A, 0x79, 0x78, 0x77, 0x75, 0x74, // 40h..7Fh 503 0x73, 0x72, 0x71, 0x70, 0x6F, 0x6E, 0x6D, 0x6C, 0x6B, 0x6A, 0x69, 0x68, 0x67, 0x66, 0x65, 0x64, // 504 0x63, 0x62, 0x61, 0x60, 0x5F, 0x5E, 0x5D, 0x5D, 0x5C, 0x5B, 0x5A, 0x59, 0x58, 0x57, 0x56, 0x55, // 505 0x54, 0x53, 0x53, 0x52, 0x51, 0x50, 0x4F, 0x4E, 0x4D, 0x4D, 0x4C, 0x4B, 0x4A, 0x49, 0x48, 0x48, // 506 0x47, 0x46, 0x45, 0x44, 0x43, 0x43, 0x42, 0x41, 0x40, 0x3F, 0x3F, 0x3E, 0x3D, 0x3C, 0x3C, 0x3B, // 80h..BFh 507 0x3A, 0x39, 0x39, 0x38, 0x37, 0x36, 0x36, 0x35, 0x34, 0x33, 0x33, 0x32, 0x31, 0x31, 0x30, 0x2F, // 508 0x2E, 0x2E, 0x2D, 0x2C, 0x2C, 0x2B, 0x2A, 0x2A, 0x29, 0x28, 0x28, 0x27, 0x26, 0x26, 0x25, 0x24, // 509 0x24, 0x23, 0x22, 0x22, 0x21, 0x20, 0x20, 0x1F, 0x1E, 0x1E, 0x1D, 0x1D, 0x1C, 0x1B, 0x1B, 0x1A, // 510 0x19, 0x19, 0x18, 0x18, 0x17, 0x16, 0x16, 0x15, 0x15, 0x14, 0x14, 0x13, 0x12, 0x12, 0x11, 0x11, // C0h..FFh 511 0x10, 0x0F, 0x0F, 0x0E, 0x0E, 0x0D, 0x0D, 0x0C, 0x0C, 0x0B, 0x0A, 0x0A, 0x09, 0x09, 0x08, 0x08, // 512 0x07, 0x07, 0x06, 0x06, 0x05, 0x05, 0x04, 0x04, 0x03, 0x03, 0x02, 0x02, 0x01, 0x01, 0x00, 0x00, // 513 0x00 // <-- one extra table entry (for "(d-7FC0h)/80h"=100h) 514 }}; 515 516 const u32 divisor = rhs | 0x8000; 517 const s32 x = static_cast<s32>(0x101 + ZeroExtend32(unr_table[((divisor & 0x7FFF) + 0x40) >> 7])); 518 const s32 d = ((static_cast<s32>(ZeroExtend32(divisor)) * -x) + 0x80) >> 8; 519 const u32 recip = static_cast<u32>(((x * (0x20000 + d)) + 0x80) >> 8); 520 521 const u32 result = Truncate32((ZeroExtend64(lhs) * ZeroExtend64(recip) + u64(0x8000)) >> 16); 522 523 // The min(1FFFFh) limit is needed for cases like FE3Fh/7F20h, F015h/780Bh, etc. (these do produce UNR result 20000h, 524 // and are saturated to 1FFFFh, but without setting overflow FLAG bits). 525 return std::min<u32>(0x1FFFF, result); 526 } 527 528 void GTE::MulMatVec(const s16* M_, const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm) 529 { 530 #define M(i, j) M_[((i) * 3) + (j)] 531 #define dot3(i) \ 532 TruncateAndSetMACAndIR<i + 1>(SignExtendMACResult<i + 1>((s64(M(i, 0)) * s64(Vx)) + (s64(M(i, 1)) * s64(Vy))) + \ 533 (s64(M(i, 2)) * s64(Vz)), \ 534 shift, lm) 535 536 dot3(0); 537 dot3(1); 538 dot3(2); 539 540 #undef dot3 541 #undef M 542 } 543 544 void GTE::MulMatVec(const s16* M_, const s32 T[3], const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm) 545 { 546 #define M(i, j) M_[((i) * 3) + (j)] 547 #define dot3(i) \ 548 TruncateAndSetMACAndIR<i + 1>( \ 549 SignExtendMACResult<i + 1>(SignExtendMACResult<i + 1>((s64(T[i]) << 12) + (s64(M(i, 0)) * s64(Vx))) + \ 550 (s64(M(i, 1)) * s64(Vy))) + \ 551 (s64(M(i, 2)) * s64(Vz)), \ 552 shift, lm) 553 554 dot3(0); 555 dot3(1); 556 dot3(2); 557 558 #undef dot3 559 #undef M 560 } 561 562 void GTE::MulMatVecBuggy(const s16* M_, const s32 T[3], const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm) 563 { 564 #define M(i, j) M_[((i) * 3) + (j)] 565 #define dot3(i) \ 566 do \ 567 { \ 568 TruncateAndSetIR<i + 1>(static_cast<s32>(SignExtendMACResult<i + 1>(SignExtendMACResult<i + 1>( \ 569 (s64(T[i]) << 12) + (s64(M(i, 0)) * s64(Vx)))) >> \ 570 shift), \ 571 false); \ 572 TruncateAndSetMACAndIR<i + 1>(SignExtendMACResult<i + 1>((s64(M(i, 1)) * s64(Vy))) + (s64(M(i, 2)) * s64(Vz)), \ 573 shift, lm); \ 574 } while (0) 575 576 dot3(0); 577 dot3(1); 578 dot3(2); 579 580 #undef dot3 581 #undef M 582 } 583 584 void GTE::Execute_MVMVA(Instruction inst) 585 { 586 REGS.FLAG.Clear(); 587 588 static constexpr const s16* M_lookup[4] = {®S.RT[0][0], ®S.LLM[0][0], ®S.LCM[0][0], nullptr}; 589 static constexpr const s16* V_lookup[4][3] = { 590 {®S.V0[0], ®S.V0[1], ®S.V0[2]}, 591 {®S.V1[0], ®S.V1[1], ®S.V1[2]}, 592 {®S.V2[0], ®S.V2[1], ®S.V2[2]}, 593 {®S.IR1, ®S.IR2, ®S.IR3}, 594 }; 595 static constexpr const s32 zero_T[3] = {}; 596 static constexpr const s32* T_lookup[4] = {REGS.TR, REGS.BK, REGS.FC, zero_T}; 597 598 const s16* M = M_lookup[inst.mvmva_multiply_matrix]; 599 const s16* const* const V = V_lookup[inst.mvmva_multiply_vector]; 600 const s32* const T = T_lookup[inst.mvmva_translation_vector]; 601 s16 buggy_M[3][3]; 602 603 if (!M) 604 { 605 // buggy 606 buggy_M[0][0] = -static_cast<s16>(ZeroExtend16(REGS.RGBC[0]) << 4); 607 buggy_M[0][1] = static_cast<s16>(ZeroExtend16(REGS.RGBC[0]) << 4); 608 buggy_M[0][2] = REGS.IR0; 609 buggy_M[1][0] = REGS.RT[0][2]; 610 buggy_M[1][1] = REGS.RT[0][2]; 611 buggy_M[1][2] = REGS.RT[0][2]; 612 buggy_M[2][0] = REGS.RT[1][1]; 613 buggy_M[2][1] = REGS.RT[1][1]; 614 buggy_M[2][2] = REGS.RT[1][1]; 615 M = &buggy_M[0][0]; 616 } 617 618 const s16 Vx = *V[0]; 619 const s16 Vy = *V[1]; 620 const s16 Vz = *V[2]; 621 if (inst.mvmva_translation_vector != 2) 622 MulMatVec(M, T, Vx, Vy, Vz, inst.GetShift(), inst.lm); 623 else 624 MulMatVecBuggy(M, T, Vx, Vy, Vz, inst.GetShift(), inst.lm); 625 626 REGS.FLAG.UpdateError(); 627 } 628 629 void GTE::Execute_SQR(Instruction inst) 630 { 631 REGS.FLAG.Clear(); 632 633 // 32-bit multiply for speed - 16x16 isn't >32bit, and we know it won't overflow/underflow. 634 const u8 shift = inst.GetShift(); 635 REGS.MAC1 = (s32(REGS.IR1) * s32(REGS.IR1)) >> shift; 636 REGS.MAC2 = (s32(REGS.IR2) * s32(REGS.IR2)) >> shift; 637 REGS.MAC3 = (s32(REGS.IR3) * s32(REGS.IR3)) >> shift; 638 639 const bool lm = inst.lm; 640 TruncateAndSetIR<1>(REGS.MAC1, lm); 641 TruncateAndSetIR<2>(REGS.MAC2, lm); 642 TruncateAndSetIR<3>(REGS.MAC3, lm); 643 644 REGS.FLAG.UpdateError(); 645 } 646 647 void GTE::Execute_OP(Instruction inst) 648 { 649 REGS.FLAG.Clear(); 650 651 // Take copies since we overwrite them in each step. 652 const u8 shift = inst.GetShift(); 653 const bool lm = inst.lm; 654 const s32 D1 = s32(REGS.RT[0][0]); 655 const s32 D2 = s32(REGS.RT[1][1]); 656 const s32 D3 = s32(REGS.RT[2][2]); 657 const s32 IR1 = s32(REGS.IR1); 658 const s32 IR2 = s32(REGS.IR2); 659 const s32 IR3 = s32(REGS.IR3); 660 661 // [MAC1,MAC2,MAC3] = [IR3*D2-IR2*D3, IR1*D3-IR3*D1, IR2*D1-IR1*D2] SAR (sf*12) 662 // [IR1, IR2, IR3] = [MAC1, MAC2, MAC3]; copy result 663 TruncateAndSetMACAndIR<1>(s64(IR3 * D2) - s64(IR2 * D3), shift, lm); 664 TruncateAndSetMACAndIR<2>(s64(IR1 * D3) - s64(IR3 * D1), shift, lm); 665 TruncateAndSetMACAndIR<3>(s64(IR2 * D1) - s64(IR1 * D2), shift, lm); 666 667 REGS.FLAG.UpdateError(); 668 } 669 670 void GTE::RTPS(const s16 V[3], u8 shift, bool lm, bool last) 671 { 672 #define dot3(i) \ 673 SignExtendMACResult<i + 1>(SignExtendMACResult<i + 1>((s64(REGS.TR[i]) << 12) + (s64(REGS.RT[i][0]) * s64(V[0]))) + \ 674 (s64(REGS.RT[i][1]) * s64(V[1]))) + \ 675 (s64(REGS.RT[i][2]) * s64(V[2])) 676 677 // IR1 = MAC1 = (TRX*1000h + RT11*VX0 + RT12*VY0 + RT13*VZ0) SAR (sf*12) 678 // IR2 = MAC2 = (TRY*1000h + RT21*VX0 + RT22*VY0 + RT23*VZ0) SAR (sf*12) 679 // IR3 = MAC3 = (TRZ*1000h + RT31*VX0 + RT32*VY0 + RT33*VZ0) SAR (sf*12) 680 const s64 x = dot3(0); 681 const s64 y = dot3(1); 682 const s64 z = dot3(2); 683 TruncateAndSetMAC<1>(x, shift); 684 TruncateAndSetMAC<2>(y, shift); 685 TruncateAndSetMAC<3>(z, shift); 686 TruncateAndSetIR<1>(REGS.MAC1, lm); 687 TruncateAndSetIR<2>(REGS.MAC2, lm); 688 689 // The command does saturate IR1,IR2,IR3 to -8000h..+7FFFh (regardless of lm bit). When using RTP with sf=0, then the 690 // IR3 saturation flag (FLAG.22) gets set <only> if "MAC3 SAR 12" exceeds -8000h..+7FFFh (although IR3 is saturated 691 // when "MAC3" exceeds -8000h..+7FFFh). 692 TruncateAndSetIR<3>(s32(z >> 12), false); 693 REGS.dr32[11] = std::clamp(REGS.MAC3, lm ? 0 : IR123_MIN_VALUE, IR123_MAX_VALUE); 694 #undef dot3 695 696 // SZ3 = MAC3 SAR ((1-sf)*12) ;ScreenZ FIFO 0..+FFFFh 697 PushSZ(s32(z >> 12)); 698 699 // MAC0=(((H*20000h/SZ3)+1)/2)*IR1+OFX, SX2=MAC0/10000h ;ScrX FIFO -400h..+3FFh 700 // MAC0=(((H*20000h/SZ3)+1)/2)*IR2+OFY, SY2=MAC0/10000h ;ScrY FIFO -400h..+3FFh 701 const s64 result = static_cast<s64>(ZeroExtend64(UNRDivide(REGS.H, REGS.SZ3))); 702 703 s64 Sx; 704 switch (s_config.aspect_ratio) 705 { 706 case DisplayAspectRatio::R16_9: 707 Sx = ((((s64(result) * s64(REGS.IR1)) * s64(3)) / s64(4)) + s64(REGS.OFX)); 708 break; 709 710 case DisplayAspectRatio::R19_9: 711 Sx = ((((s64(result) * s64(REGS.IR1)) * s64(12)) / s64(19)) + s64(REGS.OFX)); 712 break; 713 714 case DisplayAspectRatio::R20_9: 715 Sx = ((((s64(result) * s64(REGS.IR1)) * s64(3)) / s64(5)) + s64(REGS.OFX)); 716 break; 717 718 case DisplayAspectRatio::Custom: 719 case DisplayAspectRatio::MatchWindow: 720 Sx = ((((s64(result) * s64(REGS.IR1)) * s64(s_config.custom_aspect_ratio_numerator)) / 721 s64(s_config.custom_aspect_ratio_denominator)) + 722 s64(REGS.OFX)); 723 break; 724 725 case DisplayAspectRatio::Auto: 726 case DisplayAspectRatio::R4_3: 727 case DisplayAspectRatio::PAR1_1: 728 default: 729 Sx = (s64(result) * s64(REGS.IR1) + s64(REGS.OFX)); 730 break; 731 } 732 733 const s64 Sy = s64(result) * s64(REGS.IR2) + s64(REGS.OFY); 734 CheckMACOverflow<0>(Sx); 735 CheckMACOverflow<0>(Sy); 736 PushSXY(s32(Sx >> 16), s32(Sy >> 16)); 737 738 if (g_settings.gpu_pgxp_enable) 739 { 740 float precise_sz3, precise_ir1, precise_ir2; 741 742 if (g_settings.gpu_pgxp_preserve_proj_fp) 743 { 744 precise_sz3 = float(z) / 4096.0f; 745 precise_ir1 = float(x) / (static_cast<float>(1 << shift)); 746 precise_ir2 = float(y) / (static_cast<float>(1 << shift)); 747 if (lm) 748 { 749 precise_ir1 = std::clamp(precise_ir1, float(IR123_MIN_VALUE), float(IR123_MAX_VALUE)); 750 precise_ir2 = std::clamp(precise_ir2, float(IR123_MIN_VALUE), float(IR123_MAX_VALUE)); 751 } 752 else 753 { 754 precise_ir1 = std::min(precise_ir1, float(IR123_MAX_VALUE)); 755 precise_ir2 = std::min(precise_ir2, float(IR123_MAX_VALUE)); 756 } 757 } 758 else 759 { 760 precise_sz3 = float(REGS.SZ3); 761 precise_ir1 = float(REGS.IR1); 762 precise_ir2 = float(REGS.IR2); 763 } 764 765 // this can potentially use increased precision on Z 766 const float precise_z = std::max<float>(float(REGS.H) / 2.0f, precise_sz3); 767 const float precise_h_div_sz = float(REGS.H) / precise_z; 768 const float fofx = float(REGS.OFX) / float(1 << 16); 769 const float fofy = float(REGS.OFY) / float(1 << 16); 770 float precise_x = precise_ir1 * precise_h_div_sz; 771 772 switch (s_config.aspect_ratio) 773 { 774 case DisplayAspectRatio::MatchWindow: 775 case DisplayAspectRatio::Custom: 776 precise_x = precise_x * s_config.custom_aspect_ratio_f; 777 break; 778 779 case DisplayAspectRatio::R16_9: 780 precise_x = (precise_x * 3.0f) / 4.0f; 781 break; 782 783 case DisplayAspectRatio::R19_9: 784 precise_x = (precise_x * 12.0f) / 19.0f; 785 break; 786 787 case DisplayAspectRatio::R20_9: 788 precise_x = (precise_x * 3.0f) / 5.0f; 789 break; 790 791 case DisplayAspectRatio::Auto: 792 case DisplayAspectRatio::R4_3: 793 case DisplayAspectRatio::PAR1_1: 794 default: 795 break; 796 } 797 798 precise_x += fofx; 799 800 float precise_y = fofy + (precise_ir2 * precise_h_div_sz); 801 802 precise_x = std::clamp<float>(precise_x, -1024.0f, 1023.0f); 803 precise_y = std::clamp<float>(precise_y, -1024.0f, 1023.0f); 804 CPU::PGXP::GTE_RTPS(precise_x, precise_y, precise_z, REGS.dr32[14]); 805 } 806 807 if (last) 808 { 809 // MAC0=(((H*20000h/SZ3)+1)/2)*DQA+DQB, IR0=MAC0/1000h ;Depth cueing 0..+1000h 810 const s64 Sz = s64(result) * s64(REGS.DQA) + s64(REGS.DQB); 811 TruncateAndSetMAC<0>(Sz, 0); 812 TruncateAndSetIR<0>(s32(Sz >> 12), true); 813 } 814 } 815 816 void GTE::Execute_RTPS(Instruction inst) 817 { 818 REGS.FLAG.Clear(); 819 RTPS(REGS.V0, inst.GetShift(), inst.lm, true); 820 REGS.FLAG.UpdateError(); 821 } 822 823 void GTE::Execute_RTPT(Instruction inst) 824 { 825 REGS.FLAG.Clear(); 826 827 const u8 shift = inst.GetShift(); 828 const bool lm = inst.lm; 829 830 RTPS(REGS.V0, shift, lm, false); 831 RTPS(REGS.V1, shift, lm, false); 832 RTPS(REGS.V2, shift, lm, true); 833 834 REGS.FLAG.UpdateError(); 835 } 836 837 void GTE::Execute_NCLIP(Instruction inst) 838 { 839 // MAC0 = SX0*SY1 + SX1*SY2 + SX2*SY0 - SX0*SY2 - SX1*SY0 - SX2*SY1 840 REGS.FLAG.Clear(); 841 842 TruncateAndSetMAC<0>(s64(REGS.SXY0[0]) * s64(REGS.SXY1[1]) + s64(REGS.SXY1[0]) * s64(REGS.SXY2[1]) + 843 s64(REGS.SXY2[0]) * s64(REGS.SXY0[1]) - s64(REGS.SXY0[0]) * s64(REGS.SXY2[1]) - 844 s64(REGS.SXY1[0]) * s64(REGS.SXY0[1]) - s64(REGS.SXY2[0]) * s64(REGS.SXY1[1]), 845 0); 846 847 REGS.FLAG.UpdateError(); 848 } 849 850 void GTE::Execute_NCLIP_PGXP(Instruction inst) 851 { 852 if (CPU::PGXP::GTE_HasPreciseVertices(REGS.dr32[12], REGS.dr32[13], REGS.dr32[14])) 853 { 854 REGS.FLAG.Clear(); 855 REGS.MAC0 = static_cast<s32>(CPU::PGXP::GTE_NCLIP()); 856 } 857 else 858 { 859 Execute_NCLIP(inst); 860 } 861 } 862 863 void GTE::Execute_AVSZ3(Instruction inst) 864 { 865 REGS.FLAG.Clear(); 866 867 const s64 result = s64(REGS.ZSF3) * s32(u32(REGS.SZ1) + u32(REGS.SZ2) + u32(REGS.SZ3)); 868 TruncateAndSetMAC<0>(result, 0); 869 SetOTZ(s32(result >> 12)); 870 871 REGS.FLAG.UpdateError(); 872 } 873 874 void GTE::Execute_AVSZ4(Instruction inst) 875 { 876 REGS.FLAG.Clear(); 877 878 const s64 result = s64(REGS.ZSF4) * s32(u32(REGS.SZ0) + u32(REGS.SZ1) + u32(REGS.SZ2) + u32(REGS.SZ3)); 879 TruncateAndSetMAC<0>(result, 0); 880 SetOTZ(s32(result >> 12)); 881 882 REGS.FLAG.UpdateError(); 883 } 884 885 ALWAYS_INLINE void GTE::InterpolateColor(s64 in_MAC1, s64 in_MAC2, s64 in_MAC3, u8 shift, bool lm) 886 { 887 // [MAC1,MAC2,MAC3] = MAC+(FC-MAC)*IR0 888 // [IR1,IR2,IR3] = (([RFC,GFC,BFC] SHL 12) - [MAC1,MAC2,MAC3]) SAR (sf*12) 889 TruncateAndSetMACAndIR<1>((s64(REGS.FC[0]) << 12) - in_MAC1, shift, false); 890 TruncateAndSetMACAndIR<2>((s64(REGS.FC[1]) << 12) - in_MAC2, shift, false); 891 TruncateAndSetMACAndIR<3>((s64(REGS.FC[2]) << 12) - in_MAC3, shift, false); 892 893 // [MAC1,MAC2,MAC3] = (([IR1,IR2,IR3] * IR0) + [MAC1,MAC2,MAC3]) 894 // [MAC1,MAC2,MAC3] = [MAC1,MAC2,MAC3] SAR (sf*12) 895 TruncateAndSetMACAndIR<1>(s64(s32(REGS.IR1) * s32(REGS.IR0)) + in_MAC1, shift, lm); 896 TruncateAndSetMACAndIR<2>(s64(s32(REGS.IR2) * s32(REGS.IR0)) + in_MAC2, shift, lm); 897 TruncateAndSetMACAndIR<3>(s64(s32(REGS.IR3) * s32(REGS.IR0)) + in_MAC3, shift, lm); 898 } 899 900 void GTE::NCS(const s16 V[3], u8 shift, bool lm) 901 { 902 // [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] = (LLM*V0) SAR (sf*12) 903 MulMatVec(®S.LLM[0][0], V[0], V[1], V[2], shift, lm); 904 905 // [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] = (BK*1000h + LCM*IR) SAR (sf*12) 906 MulMatVec(®S.LCM[0][0], REGS.BK, REGS.IR1, REGS.IR2, REGS.IR3, shift, lm); 907 908 // Color FIFO = [MAC1/16,MAC2/16,MAC3/16,CODE], [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] 909 PushRGBFromMAC(); 910 } 911 912 void GTE::Execute_NCS(Instruction inst) 913 { 914 REGS.FLAG.Clear(); 915 916 NCS(REGS.V0, inst.GetShift(), inst.lm); 917 918 REGS.FLAG.UpdateError(); 919 } 920 921 void GTE::Execute_NCT(Instruction inst) 922 { 923 REGS.FLAG.Clear(); 924 925 const u8 shift = inst.GetShift(); 926 const bool lm = inst.lm; 927 928 NCS(REGS.V0, shift, lm); 929 NCS(REGS.V1, shift, lm); 930 NCS(REGS.V2, shift, lm); 931 932 REGS.FLAG.UpdateError(); 933 } 934 935 void GTE::NCCS(const s16 V[3], u8 shift, bool lm) 936 { 937 // [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] = (LLM*V0) SAR (sf*12) 938 MulMatVec(®S.LLM[0][0], V[0], V[1], V[2], shift, lm); 939 940 // [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] = (BK*1000h + LCM*IR) SAR (sf*12) 941 MulMatVec(®S.LCM[0][0], REGS.BK, REGS.IR1, REGS.IR2, REGS.IR3, shift, lm); 942 943 // [MAC1,MAC2,MAC3] = [R*IR1,G*IR2,B*IR3] SHL 4 ;<--- for NCDx/NCCx 944 // [MAC1,MAC2,MAC3] = [MAC1,MAC2,MAC3] SAR (sf*12) ;<--- for NCDx/NCCx 945 TruncateAndSetMACAndIR<1>(s64(s32(ZeroExtend32(REGS.RGBC[0])) * s32(REGS.IR1)) << 4, shift, lm); 946 TruncateAndSetMACAndIR<2>(s64(s32(ZeroExtend32(REGS.RGBC[1])) * s32(REGS.IR2)) << 4, shift, lm); 947 TruncateAndSetMACAndIR<3>(s64(s32(ZeroExtend32(REGS.RGBC[2])) * s32(REGS.IR3)) << 4, shift, lm); 948 949 // Color FIFO = [MAC1/16,MAC2/16,MAC3/16,CODE], [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] 950 PushRGBFromMAC(); 951 } 952 953 void GTE::Execute_NCCS(Instruction inst) 954 { 955 REGS.FLAG.Clear(); 956 957 NCCS(REGS.V0, inst.GetShift(), inst.lm); 958 959 REGS.FLAG.UpdateError(); 960 } 961 962 void GTE::Execute_NCCT(Instruction inst) 963 { 964 REGS.FLAG.Clear(); 965 966 const u8 shift = inst.GetShift(); 967 const bool lm = inst.lm; 968 969 NCCS(REGS.V0, shift, lm); 970 NCCS(REGS.V1, shift, lm); 971 NCCS(REGS.V2, shift, lm); 972 973 REGS.FLAG.UpdateError(); 974 } 975 976 void GTE::NCDS(const s16 V[3], u8 shift, bool lm) 977 { 978 // [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] = (LLM*V0) SAR (sf*12) 979 MulMatVec(®S.LLM[0][0], V[0], V[1], V[2], shift, lm); 980 981 // [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] = (BK*1000h + LCM*IR) SAR (sf*12) 982 MulMatVec(®S.LCM[0][0], REGS.BK, REGS.IR1, REGS.IR2, REGS.IR3, shift, lm); 983 984 // No need to assign these to MAC[1-3], as it'll never overflow. 985 // [MAC1,MAC2,MAC3] = [R*IR1,G*IR2,B*IR3] SHL 4 ;<--- for NCDx/NCCx 986 const s32 in_MAC1 = (s32(ZeroExtend32(REGS.RGBC[0])) * s32(REGS.IR1)) << 4; 987 const s32 in_MAC2 = (s32(ZeroExtend32(REGS.RGBC[1])) * s32(REGS.IR2)) << 4; 988 const s32 in_MAC3 = (s32(ZeroExtend32(REGS.RGBC[2])) * s32(REGS.IR3)) << 4; 989 990 // [MAC1,MAC2,MAC3] = MAC+(FC-MAC)*IR0 ;<--- for NCDx only 991 InterpolateColor(in_MAC1, in_MAC2, in_MAC3, shift, lm); 992 993 // Color FIFO = [MAC1/16,MAC2/16,MAC3/16,CODE], [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] 994 PushRGBFromMAC(); 995 } 996 997 void GTE::Execute_NCDS(Instruction inst) 998 { 999 REGS.FLAG.Clear(); 1000 1001 NCDS(REGS.V0, inst.GetShift(), inst.lm); 1002 1003 REGS.FLAG.UpdateError(); 1004 } 1005 1006 void GTE::Execute_NCDT(Instruction inst) 1007 { 1008 REGS.FLAG.Clear(); 1009 1010 const u8 shift = inst.GetShift(); 1011 const bool lm = inst.lm; 1012 1013 NCDS(REGS.V0, shift, lm); 1014 NCDS(REGS.V1, shift, lm); 1015 NCDS(REGS.V2, shift, lm); 1016 1017 REGS.FLAG.UpdateError(); 1018 } 1019 1020 void GTE::Execute_CC(Instruction inst) 1021 { 1022 REGS.FLAG.Clear(); 1023 1024 const u8 shift = inst.GetShift(); 1025 const bool lm = inst.lm; 1026 1027 // [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] = (BK*1000h + LCM*IR) SAR (sf*12) 1028 MulMatVec(®S.LCM[0][0], REGS.BK, REGS.IR1, REGS.IR2, REGS.IR3, shift, lm); 1029 1030 // [MAC1,MAC2,MAC3] = [R*IR1,G*IR2,B*IR3] SHL 4 1031 // [MAC1,MAC2,MAC3] = [MAC1,MAC2,MAC3] SAR (sf*12) 1032 TruncateAndSetMACAndIR<1>(s64(s32(ZeroExtend32(REGS.RGBC[0])) * s32(REGS.IR1)) << 4, shift, lm); 1033 TruncateAndSetMACAndIR<2>(s64(s32(ZeroExtend32(REGS.RGBC[1])) * s32(REGS.IR2)) << 4, shift, lm); 1034 TruncateAndSetMACAndIR<3>(s64(s32(ZeroExtend32(REGS.RGBC[2])) * s32(REGS.IR3)) << 4, shift, lm); 1035 1036 // Color FIFO = [MAC1/16,MAC2/16,MAC3/16,CODE], [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] 1037 PushRGBFromMAC(); 1038 1039 REGS.FLAG.UpdateError(); 1040 } 1041 1042 void GTE::Execute_CDP(Instruction inst) 1043 { 1044 REGS.FLAG.Clear(); 1045 1046 const u8 shift = inst.GetShift(); 1047 const bool lm = inst.lm; 1048 1049 // [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] = (BK*1000h + LCM*IR) SAR (sf*12) 1050 MulMatVec(®S.LCM[0][0], REGS.BK, REGS.IR1, REGS.IR2, REGS.IR3, shift, lm); 1051 1052 // No need to assign these to MAC[1-3], as it'll never overflow. 1053 // [MAC1,MAC2,MAC3] = [R*IR1,G*IR2,B*IR3] SHL 4 1054 const s32 in_MAC1 = (s32(ZeroExtend32(REGS.RGBC[0])) * s32(REGS.IR1)) << 4; 1055 const s32 in_MAC2 = (s32(ZeroExtend32(REGS.RGBC[1])) * s32(REGS.IR2)) << 4; 1056 const s32 in_MAC3 = (s32(ZeroExtend32(REGS.RGBC[2])) * s32(REGS.IR3)) << 4; 1057 1058 // [MAC1,MAC2,MAC3] = MAC+(FC-MAC)*IR0 ;<--- for CDP only 1059 // [MAC1, MAC2, MAC3] = [MAC1, MAC2, MAC3] SAR(sf * 12) 1060 InterpolateColor(in_MAC1, in_MAC2, in_MAC3, shift, lm); 1061 1062 // Color FIFO = [MAC1/16,MAC2/16,MAC3/16,CODE], [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] 1063 PushRGBFromMAC(); 1064 1065 REGS.FLAG.UpdateError(); 1066 } 1067 1068 void GTE::DPCS(const u8 color[3], u8 shift, bool lm) 1069 { 1070 // In: [IR1,IR2,IR3]=Vector, FC=Far Color, IR0=Interpolation value, CODE=MSB of RGBC 1071 // [MAC1,MAC2,MAC3] = [R,G,B] SHL 16 ;<--- for DPCS/DPCT 1072 TruncateAndSetMAC<1>((s64(ZeroExtend64(color[0])) << 16), 0); 1073 TruncateAndSetMAC<2>((s64(ZeroExtend64(color[1])) << 16), 0); 1074 TruncateAndSetMAC<3>((s64(ZeroExtend64(color[2])) << 16), 0); 1075 1076 // [MAC1,MAC2,MAC3] = MAC+(FC-MAC)*IR0 1077 InterpolateColor(REGS.MAC1, REGS.MAC2, REGS.MAC3, shift, lm); 1078 1079 // Color FIFO = [MAC1/16,MAC2/16,MAC3/16,CODE], [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] 1080 PushRGBFromMAC(); 1081 } 1082 1083 void GTE::Execute_DPCS(Instruction inst) 1084 { 1085 REGS.FLAG.Clear(); 1086 1087 DPCS(REGS.RGBC, inst.GetShift(), inst.lm); 1088 1089 REGS.FLAG.UpdateError(); 1090 } 1091 1092 void GTE::Execute_DPCT(Instruction inst) 1093 { 1094 REGS.FLAG.Clear(); 1095 1096 const u8 shift = inst.GetShift(); 1097 const bool lm = inst.lm; 1098 1099 for (u32 i = 0; i < 3; i++) 1100 DPCS(REGS.RGB0, shift, lm); 1101 1102 REGS.FLAG.UpdateError(); 1103 } 1104 1105 void GTE::Execute_DCPL(Instruction inst) 1106 { 1107 REGS.FLAG.Clear(); 1108 1109 const u8 shift = inst.GetShift(); 1110 const bool lm = inst.lm; 1111 1112 // No need to assign these to MAC[1-3], as it'll never overflow. 1113 // [MAC1,MAC2,MAC3] = [R*IR1,G*IR2,B*IR3] SHL 4 ;<--- for DCPL only 1114 const s32 in_MAC1 = (s32(ZeroExtend32(REGS.RGBC[0])) * s32(REGS.IR1)) << 4; 1115 const s32 in_MAC2 = (s32(ZeroExtend32(REGS.RGBC[1])) * s32(REGS.IR2)) << 4; 1116 const s32 in_MAC3 = (s32(ZeroExtend32(REGS.RGBC[2])) * s32(REGS.IR3)) << 4; 1117 1118 // [MAC1,MAC2,MAC3] = MAC+(FC-MAC)*IR0 1119 InterpolateColor(in_MAC1, in_MAC2, in_MAC3, shift, lm); 1120 1121 // Color FIFO = [MAC1/16,MAC2/16,MAC3/16,CODE], [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] 1122 PushRGBFromMAC(); 1123 1124 REGS.FLAG.UpdateError(); 1125 } 1126 1127 void GTE::Execute_INTPL(Instruction inst) 1128 { 1129 REGS.FLAG.Clear(); 1130 1131 const u8 shift = inst.GetShift(); 1132 const bool lm = inst.lm; 1133 1134 // No need to assign these to MAC[1-3], as it'll never overflow. 1135 // [MAC1,MAC2,MAC3] = [IR1,IR2,IR3] SHL 12 ;<--- for INTPL only 1136 // [MAC1,MAC2,MAC3] = MAC+(FC-MAC)*IR0 1137 InterpolateColor(s32(REGS.IR1) << 12, s32(REGS.IR2) << 12, s32(REGS.IR3) << 12, shift, lm); 1138 1139 // Color FIFO = [MAC1/16,MAC2/16,MAC3/16,CODE], [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] 1140 PushRGBFromMAC(); 1141 1142 REGS.FLAG.UpdateError(); 1143 } 1144 1145 void GTE::Execute_GPL(Instruction inst) 1146 { 1147 REGS.FLAG.Clear(); 1148 1149 const u8 shift = inst.GetShift(); 1150 const bool lm = inst.lm; 1151 1152 // [MAC1,MAC2,MAC3] = [MAC1,MAC2,MAC3] SHL (sf*12) ;<--- for GPL only 1153 // [MAC1,MAC2,MAC3] = (([IR1,IR2,IR3] * IR0) + [MAC1,MAC2,MAC3]) SAR (sf*12) 1154 TruncateAndSetMACAndIR<1>((s64(s32(REGS.IR1) * s32(REGS.IR0)) + (s64(REGS.MAC1) << shift)), shift, lm); 1155 TruncateAndSetMACAndIR<2>((s64(s32(REGS.IR2) * s32(REGS.IR0)) + (s64(REGS.MAC2) << shift)), shift, lm); 1156 TruncateAndSetMACAndIR<3>((s64(s32(REGS.IR3) * s32(REGS.IR0)) + (s64(REGS.MAC3) << shift)), shift, lm); 1157 1158 // Color FIFO = [MAC1/16,MAC2/16,MAC3/16,CODE], [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] 1159 PushRGBFromMAC(); 1160 1161 REGS.FLAG.UpdateError(); 1162 } 1163 1164 void GTE::Execute_GPF(Instruction inst) 1165 { 1166 REGS.FLAG.Clear(); 1167 1168 const u8 shift = inst.GetShift(); 1169 const bool lm = inst.lm; 1170 1171 // [MAC1,MAC2,MAC3] = [0,0,0] ;<--- for GPF only 1172 // [MAC1,MAC2,MAC3] = (([IR1,IR2,IR3] * IR0) + [MAC1,MAC2,MAC3]) SAR (sf*12) 1173 TruncateAndSetMACAndIR<1>(s64(s32(REGS.IR1) * s32(REGS.IR0)), shift, lm); 1174 TruncateAndSetMACAndIR<2>(s64(s32(REGS.IR2) * s32(REGS.IR0)), shift, lm); 1175 TruncateAndSetMACAndIR<3>(s64(s32(REGS.IR3) * s32(REGS.IR0)), shift, lm); 1176 1177 // Color FIFO = [MAC1/16,MAC2/16,MAC3/16,CODE], [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] 1178 PushRGBFromMAC(); 1179 1180 REGS.FLAG.UpdateError(); 1181 } 1182 1183 void GTE::ExecuteInstruction(u32 inst_bits) 1184 { 1185 const Instruction inst{inst_bits}; 1186 switch (inst.command) 1187 { 1188 case 0x01: 1189 CPU::AddGTETicks(15); 1190 Execute_RTPS(inst); 1191 break; 1192 1193 case 0x06: 1194 { 1195 CPU::AddGTETicks(8); 1196 if (g_settings.gpu_pgxp_enable && g_settings.gpu_pgxp_culling) 1197 Execute_NCLIP_PGXP(inst); 1198 else 1199 Execute_NCLIP(inst); 1200 } 1201 break; 1202 1203 case 0x0C: 1204 CPU::AddGTETicks(6); 1205 Execute_OP(inst); 1206 break; 1207 1208 case 0x10: 1209 CPU::AddGTETicks(8); 1210 Execute_DPCS(inst); 1211 break; 1212 1213 case 0x11: 1214 CPU::AddGTETicks(7); 1215 Execute_INTPL(inst); 1216 break; 1217 1218 case 0x12: 1219 CPU::AddGTETicks(8); 1220 Execute_MVMVA(inst); 1221 break; 1222 1223 case 0x13: 1224 CPU::AddGTETicks(19); 1225 Execute_NCDS(inst); 1226 break; 1227 1228 case 0x14: 1229 CPU::AddGTETicks(13); 1230 Execute_CDP(inst); 1231 break; 1232 1233 case 0x16: 1234 CPU::AddGTETicks(44); 1235 Execute_NCDT(inst); 1236 break; 1237 1238 case 0x1B: 1239 CPU::AddGTETicks(17); 1240 Execute_NCCS(inst); 1241 break; 1242 1243 case 0x1C: 1244 CPU::AddGTETicks(11); 1245 Execute_CC(inst); 1246 break; 1247 1248 case 0x1E: 1249 CPU::AddGTETicks(14); 1250 Execute_NCS(inst); 1251 break; 1252 1253 case 0x20: 1254 CPU::AddGTETicks(30); 1255 Execute_NCT(inst); 1256 break; 1257 1258 case 0x28: 1259 CPU::AddGTETicks(5); 1260 Execute_SQR(inst); 1261 break; 1262 1263 case 0x29: 1264 CPU::AddGTETicks(8); 1265 Execute_DCPL(inst); 1266 break; 1267 1268 case 0x2A: 1269 CPU::AddGTETicks(17); 1270 Execute_DPCT(inst); 1271 break; 1272 1273 case 0x2D: 1274 CPU::AddGTETicks(5); 1275 Execute_AVSZ3(inst); 1276 break; 1277 1278 case 0x2E: 1279 CPU::AddGTETicks(6); 1280 Execute_AVSZ4(inst); 1281 break; 1282 1283 case 0x30: 1284 CPU::AddGTETicks(23); 1285 Execute_RTPT(inst); 1286 break; 1287 1288 case 0x3D: 1289 CPU::AddGTETicks(5); 1290 Execute_GPF(inst); 1291 break; 1292 1293 case 0x3E: 1294 CPU::AddGTETicks(5); 1295 Execute_GPL(inst); 1296 break; 1297 1298 case 0x3F: 1299 CPU::AddGTETicks(39); 1300 Execute_NCCT(inst); 1301 break; 1302 1303 default: 1304 Panic("Missing handler"); 1305 break; 1306 } 1307 } 1308 1309 GTE::InstructionImpl GTE::GetInstructionImpl(u32 inst_bits, TickCount* ticks) 1310 { 1311 const Instruction inst{inst_bits}; 1312 switch (inst.command) 1313 { 1314 case 0x01: 1315 *ticks = 15; 1316 return &Execute_RTPS; 1317 1318 case 0x06: 1319 { 1320 *ticks = 8; 1321 if (g_settings.gpu_pgxp_enable && g_settings.gpu_pgxp_culling) 1322 return &Execute_NCLIP_PGXP; 1323 else 1324 return &Execute_NCLIP; 1325 } 1326 1327 case 0x0C: 1328 *ticks = 6; 1329 return &Execute_OP; 1330 1331 case 0x10: 1332 *ticks = 8; 1333 return &Execute_DPCS; 1334 1335 case 0x11: 1336 *ticks = 7; 1337 return &Execute_INTPL; 1338 1339 case 0x12: 1340 *ticks = 8; 1341 return &Execute_MVMVA; 1342 1343 case 0x13: 1344 *ticks = 19; 1345 return &Execute_NCDS; 1346 1347 case 0x14: 1348 *ticks = 13; 1349 return &Execute_CDP; 1350 1351 case 0x16: 1352 *ticks = 44; 1353 return &Execute_NCDT; 1354 1355 case 0x1B: 1356 *ticks = 17; 1357 return &Execute_NCCS; 1358 1359 case 0x1C: 1360 *ticks = 11; 1361 return &Execute_CC; 1362 1363 case 0x1E: 1364 *ticks = 14; 1365 return &Execute_NCS; 1366 1367 case 0x20: 1368 *ticks = 30; 1369 return &Execute_NCT; 1370 1371 case 0x28: 1372 *ticks = 5; 1373 return &Execute_SQR; 1374 1375 case 0x29: 1376 *ticks = 8; 1377 return &Execute_DCPL; 1378 1379 case 0x2A: 1380 *ticks = 17; 1381 return &Execute_DPCT; 1382 1383 case 0x2D: 1384 *ticks = 5; 1385 return &Execute_AVSZ3; 1386 1387 case 0x2E: 1388 *ticks = 6; 1389 return &Execute_AVSZ4; 1390 1391 case 0x30: 1392 *ticks = 23; 1393 return &Execute_RTPT; 1394 1395 case 0x3D: 1396 *ticks = 5; 1397 return &Execute_GPF; 1398 1399 case 0x3E: 1400 *ticks = 5; 1401 return &Execute_GPL; 1402 1403 case 0x3F: 1404 *ticks = 39; 1405 return &Execute_NCCT; 1406 1407 default: 1408 Panic("Missing handler"); 1409 } 1410 }