mdec.cpp (34332B)
1 // SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com> 2 // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) 3 4 #include "mdec.h" 5 #include "cpu_core.h" 6 #include "dma.h" 7 #include "system.h" 8 #include "timing_event.h" 9 10 #include "util/imgui_manager.h" 11 #include "util/state_wrapper.h" 12 13 #include "common/bitfield.h" 14 #include "common/fifo_queue.h" 15 #include "common/gsvector.h" 16 #include "common/log.h" 17 18 #include "imgui.h" 19 20 #include <array> 21 #include <memory> 22 23 Log_SetChannel(MDEC); 24 25 namespace MDEC { 26 namespace { 27 28 static constexpr u32 DATA_IN_FIFO_SIZE = 1024; 29 static constexpr u32 DATA_OUT_FIFO_SIZE = 768; 30 static constexpr u32 NUM_BLOCKS = 6; 31 static constexpr TickCount TICKS_PER_BLOCK = 448; 32 33 enum DataOutputDepth : u8 34 { 35 DataOutputDepth_4Bit = 0, 36 DataOutputDepth_8Bit = 1, 37 DataOutputDepth_24Bit = 2, 38 DataOutputDepth_15Bit = 3 39 }; 40 41 enum class Command : u8 42 { 43 None = 0, 44 DecodeMacroblock = 1, 45 SetIqTab = 2, 46 SetScale = 3 47 }; 48 49 enum class State : u8 50 { 51 Idle, 52 DecodingMacroblock, 53 WritingMacroblock, 54 SetIqTable, 55 SetScaleTable, 56 NoCommand 57 }; 58 59 union StatusRegister 60 { 61 u32 bits; 62 63 BitField<u32, bool, 31, 1> data_out_fifo_empty; 64 BitField<u32, bool, 30, 1> data_in_fifo_full; 65 BitField<u32, bool, 29, 1> command_busy; 66 BitField<u32, bool, 28, 1> data_in_request; 67 BitField<u32, bool, 27, 1> data_out_request; 68 BitField<u32, DataOutputDepth, 25, 2> data_output_depth; 69 BitField<u32, bool, 24, 1> data_output_signed; 70 BitField<u32, u8, 23, 1> data_output_bit15; 71 BitField<u32, u8, 16, 3> current_block; 72 BitField<u32, u16, 0, 16> parameter_words_remaining; 73 }; 74 75 union ControlRegister 76 { 77 u32 bits; 78 BitField<u32, bool, 31, 1> reset; 79 BitField<u32, bool, 30, 1> enable_dma_in; 80 BitField<u32, bool, 29, 1> enable_dma_out; 81 }; 82 83 union CommandWord 84 { 85 u32 bits; 86 87 BitField<u32, Command, 29, 3> command; 88 BitField<u32, DataOutputDepth, 27, 2> data_output_depth; 89 BitField<u32, bool, 26, 1> data_output_signed; 90 BitField<u32, u8, 25, 1> data_output_bit15; 91 BitField<u32, u16, 0, 16> parameter_word_count; 92 }; 93 94 } // namespace 95 96 static bool HasPendingBlockCopyOut(); 97 98 static void SoftReset(); 99 static void ResetDecoder(); 100 static void UpdateStatus(); 101 102 static u32 ReadDataRegister(); 103 static void WriteCommandRegister(u32 value); 104 static void Execute(); 105 106 static bool HandleDecodeMacroblockCommand(); 107 static void HandleSetQuantTableCommand(); 108 static void HandleSetScaleCommand(); 109 110 static void SetScaleMatrix(const u16* values); 111 static bool DecodeMonoMacroblock(); 112 static bool DecodeColoredMacroblock(); 113 static void ScheduleBlockCopyOut(TickCount ticks); 114 static void CopyOutBlock(void* param, TickCount ticks, TickCount ticks_late); 115 116 static bool DecodeRLE_Old(s16* blk, const u8* qt); 117 static void IDCT_Old(s16* blk); 118 static void YUVToRGB_Old(u32 xx, u32 yy, const std::array<s16, 64>& Crblk, const std::array<s16, 64>& Cbblk, 119 const std::array<s16, 64>& Yblk); 120 121 static bool DecodeRLE_New(s16* blk, const u8* qt); 122 static void IDCT_New(s16* blk); 123 static void YUVToRGB_New(u32 xx, u32 yy, const std::array<s16, 64>& Crblk, const std::array<s16, 64>& Cbblk, 124 const std::array<s16, 64>& Yblk); 125 126 static void YUVToMono(const std::array<s16, 64>& Yblk); 127 128 namespace { 129 struct MDECState 130 { 131 StatusRegister status = {}; 132 bool enable_dma_in = false; 133 bool enable_dma_out = false; 134 135 // Even though the DMA is in words, we access the FIFO as halfwords. 136 InlineFIFOQueue<u16, DATA_IN_FIFO_SIZE / sizeof(u16)> data_in_fifo; 137 InlineFIFOQueue<u32, DATA_OUT_FIFO_SIZE / sizeof(u32)> data_out_fifo; 138 State state = State::Idle; 139 u32 remaining_halfwords = 0; 140 141 std::array<u8, 64> iq_uv{}; 142 std::array<u8, 64> iq_y{}; 143 144 alignas(VECTOR_ALIGNMENT) std::array<s16, 64> scale_table{}; 145 146 // blocks, for colour: 0 - Crblk, 1 - Cbblk, 2-5 - Y 1-4 147 alignas(VECTOR_ALIGNMENT) std::array<std::array<s16, 64>, NUM_BLOCKS> blocks; 148 u32 current_block = 0; // block (0-5) 149 u32 current_coefficient = 64; // k (in block) 150 u16 current_q_scale = 0; 151 152 alignas(16) std::array<u32, 256> block_rgb{}; 153 TimingEvent block_copy_out_event{"MDEC Block Copy Out", 1, 1, &MDEC::CopyOutBlock, nullptr}; 154 155 u32 total_blocks_decoded = 0; 156 }; 157 } // namespace 158 159 ALIGN_TO_CACHE_LINE static MDECState s_state; 160 } // namespace MDEC 161 162 void MDEC::Initialize() 163 { 164 s_state.total_blocks_decoded = 0; 165 Reset(); 166 } 167 168 void MDEC::Shutdown() 169 { 170 s_state.block_copy_out_event.Deactivate(); 171 } 172 173 void MDEC::Reset() 174 { 175 s_state.block_copy_out_event.Deactivate(); 176 SoftReset(); 177 } 178 179 bool MDEC::DoState(StateWrapper& sw) 180 { 181 sw.Do(&s_state.status.bits); 182 sw.Do(&s_state.enable_dma_in); 183 sw.Do(&s_state.enable_dma_out); 184 sw.Do(&s_state.data_in_fifo); 185 sw.Do(&s_state.data_out_fifo); 186 sw.Do(&s_state.state); 187 sw.Do(&s_state.remaining_halfwords); 188 sw.Do(&s_state.iq_uv); 189 sw.Do(&s_state.iq_y); 190 191 if (sw.GetVersion() < 66) [[unlikely]] 192 { 193 std::array<u16, 64> old_scale_matrix; 194 sw.Do(&old_scale_matrix); 195 SetScaleMatrix(old_scale_matrix.data()); 196 } 197 else 198 { 199 sw.Do(&s_state.scale_table); 200 } 201 202 sw.Do(&s_state.blocks); 203 sw.Do(&s_state.current_block); 204 sw.Do(&s_state.current_coefficient); 205 sw.Do(&s_state.current_q_scale); 206 sw.Do(&s_state.block_rgb); 207 208 bool block_copy_out_pending = HasPendingBlockCopyOut(); 209 sw.Do(&block_copy_out_pending); 210 if (sw.IsReading()) 211 s_state.block_copy_out_event.SetState(block_copy_out_pending); 212 213 return !sw.HasError(); 214 } 215 216 u32 MDEC::ReadRegister(u32 offset) 217 { 218 switch (offset) 219 { 220 case 0: 221 return ReadDataRegister(); 222 223 case 4: 224 { 225 TRACE_LOG("MDEC status register -> 0x{:08X}", s_state.status.bits); 226 return s_state.status.bits; 227 } 228 229 [[unlikely]] default: 230 { 231 ERROR_LOG("Unknown MDEC register read: 0x{:08X}", offset); 232 return UINT32_C(0xFFFFFFFF); 233 } 234 } 235 } 236 237 void MDEC::WriteRegister(u32 offset, u32 value) 238 { 239 switch (offset) 240 { 241 case 0: 242 { 243 WriteCommandRegister(value); 244 return; 245 } 246 247 case 4: 248 { 249 DEBUG_LOG("MDEC control register <- 0x{:08X}", value); 250 251 const ControlRegister cr{value}; 252 if (cr.reset) 253 SoftReset(); 254 255 s_state.enable_dma_in = cr.enable_dma_in; 256 s_state.enable_dma_out = cr.enable_dma_out; 257 Execute(); 258 return; 259 } 260 261 [[unlikely]] default: 262 { 263 ERROR_LOG("Unknown MDEC register write: 0x{:08X} <- 0x{:08X}", offset, value); 264 return; 265 } 266 } 267 } 268 269 void MDEC::DMARead(u32* words, u32 word_count) 270 { 271 if (s_state.data_out_fifo.GetSize() < word_count) [[unlikely]] 272 { 273 WARNING_LOG("Insufficient data in output FIFO (requested {}, have {})", word_count, 274 s_state.data_out_fifo.GetSize()); 275 } 276 277 const u32 words_to_read = std::min(word_count, s_state.data_out_fifo.GetSize()); 278 if (words_to_read > 0) 279 { 280 s_state.data_out_fifo.PopRange(words, words_to_read); 281 words += words_to_read; 282 word_count -= words_to_read; 283 } 284 285 DEBUG_LOG("DMA read complete, {} bytes left", s_state.data_out_fifo.GetSize() * sizeof(u32)); 286 if (s_state.data_out_fifo.IsEmpty()) 287 Execute(); 288 } 289 290 void MDEC::DMAWrite(const u32* words, u32 word_count) 291 { 292 if (s_state.data_in_fifo.GetSpace() < (word_count * 2)) [[unlikely]] 293 { 294 WARNING_LOG("Input FIFO overflow (writing {}, space {})", word_count * 2, s_state.data_in_fifo.GetSpace()); 295 } 296 297 const u32 halfwords_to_write = std::min(word_count * 2, s_state.data_in_fifo.GetSpace() & ~u32(2)); 298 s_state.data_in_fifo.PushRange(reinterpret_cast<const u16*>(words), halfwords_to_write); 299 Execute(); 300 } 301 302 bool MDEC::HasPendingBlockCopyOut() 303 { 304 return s_state.block_copy_out_event.IsActive(); 305 } 306 307 void MDEC::SoftReset() 308 { 309 s_state.status.bits = 0; 310 s_state.enable_dma_in = false; 311 s_state.enable_dma_out = false; 312 s_state.data_in_fifo.Clear(); 313 s_state.data_out_fifo.Clear(); 314 s_state.state = State::Idle; 315 s_state.remaining_halfwords = 0; 316 s_state.current_block = 0; 317 s_state.current_coefficient = 64; 318 s_state.current_q_scale = 0; 319 s_state.block_copy_out_event.Deactivate(); 320 UpdateStatus(); 321 } 322 323 void MDEC::ResetDecoder() 324 { 325 s_state.current_block = 0; 326 s_state.current_coefficient = 64; 327 s_state.current_q_scale = 0; 328 } 329 330 void MDEC::UpdateStatus() 331 { 332 s_state.status.data_out_fifo_empty = s_state.data_out_fifo.IsEmpty(); 333 s_state.status.data_in_fifo_full = s_state.data_in_fifo.IsFull(); 334 335 s_state.status.command_busy = (s_state.state != State::Idle); 336 s_state.status.parameter_words_remaining = Truncate16((s_state.remaining_halfwords / 2) - 1); 337 s_state.status.current_block = (s_state.current_block + 4) % NUM_BLOCKS; 338 339 // we always want data in if it's enabled 340 const bool data_in_request = s_state.enable_dma_in && s_state.data_in_fifo.GetSpace() >= (32 * 2); 341 s_state.status.data_in_request = data_in_request; 342 DMA::SetRequest(DMA::Channel::MDECin, data_in_request); 343 344 // we only want to send data out if we have some in the fifo 345 const bool data_out_request = s_state.enable_dma_out && !s_state.data_out_fifo.IsEmpty(); 346 s_state.status.data_out_request = data_out_request; 347 DMA::SetRequest(DMA::Channel::MDECout, data_out_request); 348 } 349 350 u32 MDEC::ReadDataRegister() 351 { 352 if (s_state.data_out_fifo.IsEmpty()) 353 { 354 // Stall the CPU until we're done processing. 355 if (HasPendingBlockCopyOut()) 356 { 357 DEV_LOG("MDEC data out FIFO empty on read - stalling CPU"); 358 CPU::AddPendingTicks(s_state.block_copy_out_event.GetTicksUntilNextExecution()); 359 } 360 else 361 { 362 WARNING_LOG("MDEC data out FIFO empty on read and no data processing"); 363 return UINT32_C(0xFFFFFFFF); 364 } 365 } 366 367 const u32 value = s_state.data_out_fifo.Pop(); 368 if (s_state.data_out_fifo.IsEmpty()) 369 Execute(); 370 else 371 UpdateStatus(); 372 373 return value; 374 } 375 376 void MDEC::WriteCommandRegister(u32 value) 377 { 378 TRACE_LOG("MDEC command/data register <- 0x{:08X}", value); 379 380 s_state.data_in_fifo.Push(Truncate16(value)); 381 s_state.data_in_fifo.Push(Truncate16(value >> 16)); 382 383 Execute(); 384 } 385 386 void MDEC::Execute() 387 { 388 for (;;) 389 { 390 switch (s_state.state) 391 { 392 case State::Idle: 393 { 394 if (s_state.data_in_fifo.GetSize() < 2) 395 goto finished; 396 397 // first word 398 const CommandWord cw{ZeroExtend32(s_state.data_in_fifo.Peek(0)) | 399 (ZeroExtend32(s_state.data_in_fifo.Peek(1)) << 16)}; 400 s_state.status.data_output_depth = cw.data_output_depth; 401 s_state.status.data_output_signed = cw.data_output_signed; 402 s_state.status.data_output_bit15 = cw.data_output_bit15; 403 s_state.data_in_fifo.Remove(2); 404 s_state.data_out_fifo.Clear(); 405 406 u32 num_words; 407 State new_state; 408 switch (cw.command) 409 { 410 case Command::DecodeMacroblock: 411 num_words = ZeroExtend32(cw.parameter_word_count.GetValue()); 412 new_state = State::DecodingMacroblock; 413 break; 414 415 case Command::SetIqTab: 416 num_words = 16 + (((cw.bits & 1) != 0) ? 16 : 0); 417 new_state = State::SetIqTable; 418 break; 419 420 case Command::SetScale: 421 num_words = 32; 422 new_state = State::SetScaleTable; 423 break; 424 425 default: 426 [[unlikely]] DEV_LOG("Invalid MDEC command 0x{:08X}", cw.bits); 427 num_words = cw.parameter_word_count.GetValue(); 428 new_state = State::NoCommand; 429 break; 430 } 431 432 DEBUG_LOG("MDEC command: 0x{:08X} ({}, {} words in parameter, {} expected)", cw.bits, 433 static_cast<u8>(cw.command.GetValue()), cw.parameter_word_count.GetValue(), num_words); 434 435 s_state.remaining_halfwords = num_words * 2; 436 s_state.state = new_state; 437 UpdateStatus(); 438 continue; 439 } 440 441 case State::DecodingMacroblock: 442 { 443 if (HandleDecodeMacroblockCommand()) 444 { 445 // we should be writing out now 446 DebugAssert(s_state.state == State::WritingMacroblock); 447 goto finished; 448 } 449 450 if (s_state.remaining_halfwords == 0 && s_state.current_block != NUM_BLOCKS) 451 { 452 // expecting data, but nothing more will be coming. bail out 453 ResetDecoder(); 454 s_state.state = State::Idle; 455 continue; 456 } 457 458 goto finished; 459 } 460 461 case State::WritingMacroblock: 462 { 463 // this gets executed via the event, so if we get here, wait. 464 goto finished; 465 } 466 467 case State::SetIqTable: 468 { 469 if (s_state.data_in_fifo.GetSize() < s_state.remaining_halfwords) 470 goto finished; 471 472 HandleSetQuantTableCommand(); 473 s_state.state = State::Idle; 474 UpdateStatus(); 475 continue; 476 } 477 478 case State::SetScaleTable: 479 { 480 if (s_state.data_in_fifo.GetSize() < s_state.remaining_halfwords) 481 goto finished; 482 483 HandleSetScaleCommand(); 484 s_state.state = State::Idle; 485 UpdateStatus(); 486 continue; 487 } 488 489 case State::NoCommand: 490 { 491 // can potentially have a large amount of halfwords, so eat them as we go 492 const u32 words_to_consume = std::min(s_state.remaining_halfwords, s_state.data_in_fifo.GetSize()); 493 s_state.data_in_fifo.Remove(words_to_consume); 494 s_state.remaining_halfwords -= words_to_consume; 495 if (s_state.remaining_halfwords == 0) 496 goto finished; 497 498 s_state.state = State::Idle; 499 UpdateStatus(); 500 continue; 501 } 502 503 default: 504 UnreachableCode(); 505 return; 506 } 507 } 508 509 finished: 510 // if we get here, it's because the FIFO is now empty 511 UpdateStatus(); 512 } 513 514 bool MDEC::HandleDecodeMacroblockCommand() 515 { 516 if (s_state.status.data_output_depth <= DataOutputDepth_8Bit) 517 return DecodeMonoMacroblock(); 518 else 519 return DecodeColoredMacroblock(); 520 } 521 522 bool MDEC::DecodeMonoMacroblock() 523 { 524 // TODO: This should guard the output not the input 525 if (!s_state.data_out_fifo.IsEmpty()) 526 return false; 527 528 if (g_settings.use_old_mdec_routines) [[unlikely]] 529 { 530 if (!DecodeRLE_Old(s_state.blocks[0].data(), s_state.iq_y.data())) 531 return false; 532 533 IDCT_Old(s_state.blocks[0].data()); 534 } 535 else 536 { 537 if (!DecodeRLE_New(s_state.blocks[0].data(), s_state.iq_y.data())) 538 return false; 539 540 IDCT_New(s_state.blocks[0].data()); 541 } 542 543 DEBUG_LOG("Decoded mono macroblock, {} words remaining", s_state.remaining_halfwords / 2); 544 ResetDecoder(); 545 s_state.state = State::WritingMacroblock; 546 547 YUVToMono(s_state.blocks[0]); 548 549 ScheduleBlockCopyOut(TICKS_PER_BLOCK * 6); 550 551 s_state.total_blocks_decoded++; 552 return true; 553 } 554 555 bool MDEC::DecodeColoredMacroblock() 556 { 557 if (g_settings.use_old_mdec_routines) [[unlikely]] 558 { 559 for (; s_state.current_block < NUM_BLOCKS; s_state.current_block++) 560 { 561 if (!DecodeRLE_Old(s_state.blocks[s_state.current_block].data(), 562 (s_state.current_block >= 2) ? s_state.iq_y.data() : s_state.iq_uv.data())) 563 return false; 564 565 IDCT_Old(s_state.blocks[s_state.current_block].data()); 566 } 567 568 if (!s_state.data_out_fifo.IsEmpty()) 569 return false; 570 571 // done decoding 572 DEBUG_LOG("Decoded colored macroblock, {} words remaining", s_state.remaining_halfwords / 2); 573 ResetDecoder(); 574 s_state.state = State::WritingMacroblock; 575 576 YUVToRGB_Old(0, 0, s_state.blocks[0], s_state.blocks[1], s_state.blocks[2]); 577 YUVToRGB_Old(8, 0, s_state.blocks[0], s_state.blocks[1], s_state.blocks[3]); 578 YUVToRGB_Old(0, 8, s_state.blocks[0], s_state.blocks[1], s_state.blocks[4]); 579 YUVToRGB_Old(8, 8, s_state.blocks[0], s_state.blocks[1], s_state.blocks[5]); 580 } 581 else 582 { 583 for (; s_state.current_block < NUM_BLOCKS; s_state.current_block++) 584 { 585 if (!DecodeRLE_New(s_state.blocks[s_state.current_block].data(), 586 (s_state.current_block >= 2) ? s_state.iq_y.data() : s_state.iq_uv.data())) 587 return false; 588 589 IDCT_New(s_state.blocks[s_state.current_block].data()); 590 } 591 592 if (!s_state.data_out_fifo.IsEmpty()) 593 return false; 594 595 // done decoding 596 DEBUG_LOG("Decoded colored macroblock, {} words remaining", s_state.remaining_halfwords / 2); 597 ResetDecoder(); 598 s_state.state = State::WritingMacroblock; 599 600 YUVToRGB_New(0, 0, s_state.blocks[0], s_state.blocks[1], s_state.blocks[2]); 601 YUVToRGB_New(8, 0, s_state.blocks[0], s_state.blocks[1], s_state.blocks[3]); 602 YUVToRGB_New(0, 8, s_state.blocks[0], s_state.blocks[1], s_state.blocks[4]); 603 YUVToRGB_New(8, 8, s_state.blocks[0], s_state.blocks[1], s_state.blocks[5]); 604 } 605 606 s_state.total_blocks_decoded += 4; 607 608 ScheduleBlockCopyOut(TICKS_PER_BLOCK * 6); 609 return true; 610 } 611 612 void MDEC::ScheduleBlockCopyOut(TickCount ticks) 613 { 614 DebugAssert(!HasPendingBlockCopyOut()); 615 DEBUG_LOG("Scheduling block copy out in {} ticks", ticks); 616 617 s_state.block_copy_out_event.SetIntervalAndSchedule(ticks); 618 } 619 620 void MDEC::CopyOutBlock(void* param, TickCount ticks, TickCount ticks_late) 621 { 622 Assert(s_state.state == State::WritingMacroblock); 623 s_state.block_copy_out_event.Deactivate(); 624 625 switch (s_state.status.data_output_depth) 626 { 627 case DataOutputDepth_4Bit: 628 { 629 const u32* in_ptr = s_state.block_rgb.data(); 630 for (u32 i = 0; i < (64 / 8); i++) 631 { 632 u32 value = *(in_ptr++) >> 4; 633 value |= (*(in_ptr++) >> 4) << 4; 634 value |= (*(in_ptr++) >> 4) << 8; 635 value |= (*(in_ptr++) >> 4) << 12; 636 value |= (*(in_ptr++) >> 4) << 16; 637 value |= (*(in_ptr++) >> 4) << 20; 638 value |= (*(in_ptr++) >> 4) << 24; 639 value |= (*(in_ptr++) >> 4) << 28; 640 s_state.data_out_fifo.Push(value); 641 } 642 } 643 break; 644 645 case DataOutputDepth_8Bit: 646 { 647 const u32* in_ptr = s_state.block_rgb.data(); 648 for (u32 i = 0; i < (64 / 4); i++) 649 { 650 u32 value = *in_ptr++; 651 value |= *in_ptr++ << 8; 652 value |= *in_ptr++ << 16; 653 value |= *in_ptr++ << 24; 654 s_state.data_out_fifo.Push(value); 655 } 656 } 657 break; 658 659 case DataOutputDepth_24Bit: 660 { 661 // pack tightly 662 u32 index = 0; 663 u32 state = 0; 664 u32 rgb = 0; 665 while (index < s_state.block_rgb.size()) 666 { 667 switch (state) 668 { 669 case 0: 670 rgb = s_state.block_rgb[index++]; // RGB- 671 state = 1; 672 break; 673 case 1: 674 rgb |= (s_state.block_rgb[index] & 0xFF) << 24; // RGBR 675 s_state.data_out_fifo.Push(rgb); 676 rgb = s_state.block_rgb[index] >> 8; // GB-- 677 index++; 678 state = 2; 679 break; 680 case 2: 681 rgb |= s_state.block_rgb[index] << 16; // GBRG 682 s_state.data_out_fifo.Push(rgb); 683 rgb = s_state.block_rgb[index] >> 16; // B--- 684 index++; 685 state = 3; 686 break; 687 case 3: 688 rgb |= s_state.block_rgb[index] << 8; // BRGB 689 s_state.data_out_fifo.Push(rgb); 690 index++; 691 state = 0; 692 break; 693 } 694 } 695 break; 696 } 697 698 case DataOutputDepth_15Bit: 699 { 700 if (g_settings.use_old_mdec_routines) [[unlikely]] 701 { 702 const u16 a = ZeroExtend16(s_state.status.data_output_bit15.GetValue()) << 15; 703 for (u32 i = 0; i < static_cast<u32>(s_state.block_rgb.size());) 704 { 705 u32 color = s_state.block_rgb[i++]; 706 u16 r = Truncate16((color >> 3) & 0x1Fu); 707 u16 g = Truncate16((color >> 11) & 0x1Fu); 708 u16 b = Truncate16((color >> 19) & 0x1Fu); 709 const u16 color15a = r | (g << 5) | (b << 10) | (a << 15); 710 711 color = s_state.block_rgb[i++]; 712 r = Truncate16((color >> 3) & 0x1Fu); 713 g = Truncate16((color >> 11) & 0x1Fu); 714 b = Truncate16((color >> 19) & 0x1Fu); 715 const u16 color15b = r | (g << 5) | (b << 10) | (a << 15); 716 717 s_state.data_out_fifo.Push(ZeroExtend32(color15a) | (ZeroExtend32(color15b) << 16)); 718 } 719 } 720 else 721 { 722 const u32 a = ZeroExtend32(s_state.status.data_output_bit15.GetValue()) << 15; 723 for (u32 i = 0; i < static_cast<u32>(s_state.block_rgb.size());) 724 { 725 #define E8TO5(color) (std::min<u32>((((color) + 4) >> 3), 0x1F)) 726 u32 color = s_state.block_rgb[i++]; 727 u32 r = E8TO5(color & 0xFFu); 728 u32 g = E8TO5((color >> 8) & 0xFFu); 729 u32 b = E8TO5((color >> 16) & 0xFFu); 730 const u32 color15a = r | (g << 5) | (b << 10) | a; 731 732 color = s_state.block_rgb[i++]; 733 r = E8TO5(color & 0xFFu); 734 g = E8TO5((color >> 8) & 0xFFu); 735 b = E8TO5((color >> 16) & 0xFFu); 736 const u32 color15b = r | (g << 5) | (b << 10) | a; 737 #undef E8TO5 738 739 s_state.data_out_fifo.Push(color15a | (color15b << 16)); 740 } 741 } 742 } 743 break; 744 745 default: 746 break; 747 } 748 749 DEBUG_LOG("Block copied out, fifo size = {} ({} bytes)", s_state.data_out_fifo.GetSize(), 750 s_state.data_out_fifo.GetSize() * sizeof(u32)); 751 752 // if we've copied out all blocks, command is complete 753 s_state.state = (s_state.remaining_halfwords == 0) ? State::Idle : State::DecodingMacroblock; 754 Execute(); 755 } 756 757 bool MDEC::DecodeRLE_Old(s16* blk, const u8* qt) 758 { 759 static constexpr std::array<u8, 64> zagzig = {{0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, 760 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, 761 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51, 762 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63}}; 763 764 if (s_state.current_coefficient == 64) 765 { 766 std::fill_n(blk, 64, s16(0)); 767 768 // skip padding at start 769 u16 n; 770 for (;;) 771 { 772 if (s_state.data_in_fifo.IsEmpty() || s_state.remaining_halfwords == 0) 773 return false; 774 775 n = s_state.data_in_fifo.Pop(); 776 s_state.remaining_halfwords--; 777 778 if (n == 0xFE00) 779 continue; 780 else 781 break; 782 } 783 784 s_state.current_coefficient = 0; 785 s_state.current_q_scale = (n >> 10) & 0x3F; 786 s32 val = SignExtendN<10, s32>(static_cast<s32>(n & 0x3FF)) * 787 static_cast<s32>(ZeroExtend32(qt[s_state.current_coefficient])); 788 789 if (s_state.current_q_scale == 0) 790 val = SignExtendN<10, s32>(static_cast<s32>(n & 0x3FF)) * 2; 791 792 val = std::clamp(val, -0x400, 0x3FF); 793 if (s_state.current_q_scale > 0) 794 blk[zagzig[s_state.current_coefficient]] = static_cast<s16>(val); 795 else 796 blk[s_state.current_coefficient] = static_cast<s16>(val); 797 } 798 799 while (!s_state.data_in_fifo.IsEmpty() && s_state.remaining_halfwords > 0) 800 { 801 u16 n = s_state.data_in_fifo.Pop(); 802 s_state.remaining_halfwords--; 803 804 s_state.current_coefficient += ((n >> 10) & 0x3F) + 1; 805 if (s_state.current_coefficient < 64) 806 { 807 s32 val = 808 (SignExtendN<10, s32>(static_cast<s32>(n & 0x3FF)) * 809 static_cast<s32>(ZeroExtend32(qt[s_state.current_coefficient])) * static_cast<s32>(s_state.current_q_scale) + 810 4) / 811 8; 812 813 if (s_state.current_q_scale == 0) 814 val = SignExtendN<10, s32>(static_cast<s32>(n & 0x3FF)) * 2; 815 816 val = std::clamp(val, -0x400, 0x3FF); 817 if (s_state.current_q_scale > 0) 818 blk[zagzig[s_state.current_coefficient]] = static_cast<s16>(val); 819 else 820 blk[s_state.current_coefficient] = static_cast<s16>(val); 821 } 822 823 if (s_state.current_coefficient >= 63) 824 { 825 s_state.current_coefficient = 64; 826 return true; 827 } 828 } 829 830 return false; 831 } 832 833 void MDEC::IDCT_Old(s16* blk) 834 { 835 std::array<s64, 64> temp_buffer; 836 for (u32 x = 0; x < 8; x++) 837 { 838 for (u32 y = 0; y < 8; y++) 839 { 840 s64 sum = 0; 841 for (u32 u = 0; u < 8; u++) 842 sum += s32(blk[u * 8 + x]) * s32(s_state.scale_table[y * 8 + u]); 843 temp_buffer[x + y * 8] = sum; 844 } 845 } 846 for (u32 x = 0; x < 8; x++) 847 { 848 for (u32 y = 0; y < 8; y++) 849 { 850 s64 sum = 0; 851 for (u32 u = 0; u < 8; u++) 852 sum += s64(temp_buffer[u + y * 8]) * s32(s_state.scale_table[x * 8 + u]); 853 854 blk[x + y * 8] = 855 static_cast<s16>(std::clamp<s32>(SignExtendN<9, s32>((sum >> 32) + ((sum >> 31) & 1)), -128, 127)); 856 } 857 } 858 } 859 860 void MDEC::YUVToRGB_Old(u32 xx, u32 yy, const std::array<s16, 64>& Crblk, const std::array<s16, 64>& Cbblk, 861 const std::array<s16, 64>& Yblk) 862 { 863 const s16 addval = s_state.status.data_output_signed ? 0 : 0x80; 864 for (u32 y = 0; y < 8; y++) 865 { 866 for (u32 x = 0; x < 8; x++) 867 { 868 s16 R = Crblk[((x + xx) / 2) + ((y + yy) / 2) * 8]; 869 s16 B = Cbblk[((x + xx) / 2) + ((y + yy) / 2) * 8]; 870 s16 G = static_cast<s16>((-0.3437f * static_cast<float>(B)) + (-0.7143f * static_cast<float>(R))); 871 872 R = static_cast<s16>(1.402f * static_cast<float>(R)); 873 B = static_cast<s16>(1.772f * static_cast<float>(B)); 874 875 s16 Y = Yblk[x + y * 8]; 876 R = static_cast<s16>(std::clamp(static_cast<int>(Y) + R, -128, 127)) + addval; 877 G = static_cast<s16>(std::clamp(static_cast<int>(Y) + G, -128, 127)) + addval; 878 B = static_cast<s16>(std::clamp(static_cast<int>(Y) + B, -128, 127)) + addval; 879 880 s_state.block_rgb[(x + xx) + ((y + yy) * 16)] = ZeroExtend32(static_cast<u16>(R)) | 881 (ZeroExtend32(static_cast<u16>(G)) << 8) | 882 (ZeroExtend32(static_cast<u16>(B)) << 16); 883 } 884 } 885 } 886 887 bool MDEC::DecodeRLE_New(s16* blk, const u8* qt) 888 { 889 // Swapped to row-major so we can vectorize the IDCT. 890 static constexpr std::array<u8, 64> zigzag = {{0, 8, 1, 2, 9, 16, 24, 17, 10, 3, 4, 11, 18, 25, 32, 40, 891 33, 26, 19, 12, 5, 6, 13, 20, 27, 34, 41, 48, 56, 49, 42, 35, 892 28, 21, 14, 7, 15, 22, 29, 36, 43, 50, 57, 58, 51, 44, 37, 30, 893 23, 31, 38, 45, 52, 59, 60, 53, 46, 39, 47, 54, 61, 62, 55, 63}}; 894 895 if (s_state.current_coefficient == 64) 896 { 897 std::fill_n(blk, 64, s16(0)); 898 899 // skip padding at start 900 u16 n; 901 for (;;) 902 { 903 if (s_state.data_in_fifo.IsEmpty() || s_state.remaining_halfwords == 0) 904 return false; 905 906 n = s_state.data_in_fifo.Pop(); 907 s_state.remaining_halfwords--; 908 909 if (n == 0xFE00) 910 continue; 911 else 912 break; 913 } 914 915 s_state.current_coefficient = 0; 916 s_state.current_q_scale = n >> 10; 917 918 // Store the DCT blocks with an additional 4 bits of precision. 919 const s32 val = SignExtendN<10, s32>(static_cast<s32>(n)); 920 const s32 coeff = 921 (s_state.current_q_scale == 0) ? (val << 5) : (((val * qt[0]) << 4) + (val ? ((val < 0) ? 8 : -8) : 0)); 922 blk[zigzag[0]] = static_cast<s16>(std::clamp(coeff, -0x4000, 0x3FFF)); 923 } 924 925 while (!s_state.data_in_fifo.IsEmpty() && s_state.remaining_halfwords > 0) 926 { 927 u16 n = s_state.data_in_fifo.Pop(); 928 s_state.remaining_halfwords--; 929 930 s_state.current_coefficient += ((n >> 10) + 1); 931 if (s_state.current_coefficient < 64) 932 { 933 const s32 val = SignExtendN<10, s32>(n); 934 const s32 scq = static_cast<s32>(s_state.current_q_scale * qt[s_state.current_coefficient]); 935 const s32 coeff = (scq == 0) ? (val << 5) : ((((val * scq) >> 3) << 4) + (val ? ((val < 0) ? 8 : -8) : 0)); 936 blk[zigzag[s_state.current_coefficient]] = static_cast<s16>(std::clamp(coeff, -0x4000, 0x3FFF)); 937 } 938 939 if (s_state.current_coefficient >= 63) 940 { 941 s_state.current_coefficient = 64; 942 return true; 943 } 944 } 945 946 return false; 947 } 948 949 static s16 IDCTRow(const s16* blk, const s16* idct_matrix) 950 { 951 // IDCT matrix is -32768..32767, block is -16384..16383. 4 adds can happen without overflow. 952 GSVector4i sum = GSVector4i::load<false>(blk).madd_s16(GSVector4i::load<true>(idct_matrix)).addp_s32(); 953 return static_cast<s16>(((static_cast<s64>(sum.extract32<0>()) + static_cast<s64>(sum.extract32<1>())) + 0x20000) >> 954 18); 955 } 956 957 void MDEC::IDCT_New(s16* blk) 958 { 959 alignas(VECTOR_ALIGNMENT) std::array<s16, 64> temp; 960 for (u32 x = 0; x < 8; x++) 961 { 962 for (u32 y = 0; y < 8; y++) 963 temp[y * 8 + x] = IDCTRow(&blk[x * 8], &s_state.scale_table[y * 8]); 964 } 965 for (u32 x = 0; x < 8; x++) 966 { 967 for (u32 y = 0; y < 8; y++) 968 { 969 const s32 sum = IDCTRow(&temp[x * 8], &s_state.scale_table[y * 8]); 970 blk[x * 8 + y] = static_cast<s16>(std::clamp(SignExtendN<9, s32>(sum), -128, 127)); 971 } 972 } 973 } 974 975 void MDEC::YUVToRGB_New(u32 xx, u32 yy, const std::array<s16, 64>& Crblk, const std::array<s16, 64>& Cbblk, 976 const std::array<s16, 64>& Yblk) 977 { 978 const GSVector4i addval = s_state.status.data_output_signed ? GSVector4i::cxpr(0) : GSVector4i::cxpr(0x80808080); 979 for (u32 y = 0; y < 8; y++) 980 { 981 const GSVector4i Cr = GSVector4i::loadl(&Crblk[(xx / 2) + ((y + yy) / 2) * 8]).s16to32(); 982 const GSVector4i Cb = GSVector4i::loadl(&Cbblk[(xx / 2) + ((y + yy) / 2) * 8]).s16to32(); 983 const GSVector4i Y = GSVector4i::load<true>(&Yblk[y * 8]); 984 985 // BT.601 YUV->RGB coefficients, rounding formula from Mednafen. 986 // r = clamp(sext9(Y + (((359 * Cr) + 0x80) >> 8)), -128, 127) + addval; 987 // g = clamp(sext9(Y + ((((-88 * Cb) & ~0x1F) + ((-183 * Cr) & ~0x07) + 0x80) >> 8)), -128, 127) + addval 988 // b = clamp(sext9<9, s32>(Y + (((454 * Cb) + 0x80) >> 8)), -128, 127) + addval 989 990 // Need to do the multiply as 32-bit, since 127 * 359 is greater than INT16_MAX. 991 // upl16(self) = interleave XYZW0000 -> XXYYZZWW. 992 const GSVector4i Crmul = Cr.mul32l(GSVector4i::cxpr(359)).add16(GSVector4i::cxpr(0x80)).sra32<8>().ps32(); 993 const GSVector4i Cbmul = Cb.mul32l(GSVector4i::cxpr(454)).add16(GSVector4i::cxpr(0x80)).sra32<8>().ps32(); 994 const GSVector4i CrCbmul = (Cb.mul32l(GSVector4i::cxpr(-88)) & GSVector4i::cxpr(~0x1F)) 995 .add32(Cr.mul32l(GSVector4i::cxpr(-183)) & GSVector4i::cxpr(~0x07)) 996 .add32(GSVector4i::cxpr(0x80)) 997 .sra32<8>() 998 .ps32(); 999 const GSVector4i r = Crmul.upl16(Crmul).add16(Y).sll16<7>().sra16<7>().ps16().add8(addval); 1000 const GSVector4i g = CrCbmul.upl16(CrCbmul).add16(Y).sll16<7>().sra16<7>().ps16().add8(addval); 1001 const GSVector4i b = Cbmul.upl16(Cbmul).add16(Y).sll16<7>().sra16<7>().ps16().add8(addval); 1002 const GSVector4i rg = r.upl8(g); 1003 const GSVector4i b0 = b.upl8(); 1004 const GSVector4i rgblow = rg.upl16(b0); 1005 const GSVector4i rgbhigh = rg.uph16(b0); 1006 1007 u32* const out_row = &s_state.block_rgb[xx + ((y + yy) * 16)]; 1008 GSVector4i::store<false>(&out_row[0], rgblow); 1009 GSVector4i::store<false>(&out_row[4], rgbhigh); 1010 } 1011 } 1012 1013 void MDEC::YUVToMono(const std::array<s16, 64>& Yblk) 1014 { 1015 const s32 addval = s_state.status.data_output_signed ? 0 : 0x80; 1016 for (u32 i = 0; i < 64; i++) 1017 s_state.block_rgb[i] = static_cast<u32>(std::clamp(SignExtendN<9, s32>(Yblk[i]), -128, 127) + addval); 1018 } 1019 1020 void MDEC::HandleSetQuantTableCommand() 1021 { 1022 DebugAssert(s_state.remaining_halfwords >= 32); 1023 1024 // TODO: Remove extra copies.. 1025 std::array<u16, 32> packed_data; 1026 s_state.data_in_fifo.PopRange(packed_data.data(), static_cast<u32>(packed_data.size())); 1027 s_state.remaining_halfwords -= 32; 1028 std::memcpy(s_state.iq_y.data(), packed_data.data(), s_state.iq_y.size()); 1029 1030 if (s_state.remaining_halfwords > 0) 1031 { 1032 DebugAssert(s_state.remaining_halfwords >= 32); 1033 1034 s_state.data_in_fifo.PopRange(packed_data.data(), static_cast<u32>(packed_data.size())); 1035 std::memcpy(s_state.iq_uv.data(), packed_data.data(), s_state.iq_uv.size()); 1036 } 1037 } 1038 1039 void MDEC::HandleSetScaleCommand() 1040 { 1041 DebugAssert(s_state.remaining_halfwords == 64); 1042 1043 std::array<u16, 64> packed_data; 1044 s_state.data_in_fifo.PopRange(packed_data.data(), static_cast<u32>(packed_data.size())); 1045 s_state.remaining_halfwords -= 32; 1046 SetScaleMatrix(packed_data.data()); 1047 } 1048 1049 void MDEC::SetScaleMatrix(const u16* values) 1050 { 1051 for (u32 y = 0; y < 8; y++) 1052 { 1053 for (u32 x = 0; x < 8; x++) 1054 s_state.scale_table[y * 8 + x] = values[x * 8 + y]; 1055 } 1056 } 1057 1058 void MDEC::DrawDebugStateWindow() 1059 { 1060 const float framebuffer_scale = ImGuiManager::GetGlobalScale(); 1061 1062 ImGui::SetNextWindowSize(ImVec2(300.0f * framebuffer_scale, 350.0f * framebuffer_scale), ImGuiCond_FirstUseEver); 1063 if (!ImGui::Begin("MDEC State", nullptr)) 1064 { 1065 ImGui::End(); 1066 return; 1067 } 1068 1069 static constexpr std::array<const char*, 5> state_names = { 1070 {"None", "Decoding Macroblock", "Writing Macroblock", "SetIqTab", "SetScale"}}; 1071 static constexpr std::array<const char*, 4> output_depths = {{"4-bit", "8-bit", "24-bit", "15-bit"}}; 1072 static constexpr std::array<const char*, 7> block_names = {{"Crblk", "Cbblk", "Y1", "Y2", "Y3", "Y4", "Output"}}; 1073 1074 ImGui::Text("Blocks Decoded: %u", s_state.total_blocks_decoded); 1075 ImGui::Text("Data-In FIFO Size: %u (%u bytes)", s_state.data_in_fifo.GetSize(), s_state.data_in_fifo.GetSize() * 4); 1076 ImGui::Text("Data-Out FIFO Size: %u (%u bytes)", s_state.data_out_fifo.GetSize(), 1077 s_state.data_out_fifo.GetSize() * 4); 1078 ImGui::Text("DMA Enable: %s%s", s_state.enable_dma_in ? "In " : "", s_state.enable_dma_out ? "Out" : ""); 1079 ImGui::Text("Current State: %s", state_names[static_cast<u8>(s_state.state)]); 1080 ImGui::Text("Current Block: %s", block_names[s_state.current_block]); 1081 ImGui::Text("Current Coefficient: %u", s_state.current_coefficient); 1082 1083 if (ImGui::CollapsingHeader("Status", ImGuiTreeNodeFlags_DefaultOpen)) 1084 { 1085 ImGui::Text("Data-Out FIFO Empty: %s", s_state.status.data_out_fifo_empty ? "Yes" : "No"); 1086 ImGui::Text("Data-In FIFO Full: %s", s_state.status.data_in_fifo_full ? "Yes" : "No"); 1087 ImGui::Text("Command Busy: %s", s_state.status.command_busy ? "Yes" : "No"); 1088 ImGui::Text("Data-In Request: %s", s_state.status.data_in_request ? "Yes" : "No"); 1089 ImGui::Text("Output Depth: %s", output_depths[static_cast<u8>(s_state.status.data_output_depth.GetValue())]); 1090 ImGui::Text("Output Signed: %s", s_state.status.data_output_signed ? "Yes" : "No"); 1091 ImGui::Text("Output Bit 15: %u", ZeroExtend32(s_state.status.data_output_bit15.GetValue())); 1092 ImGui::Text("Current Block: %u", ZeroExtend32(s_state.status.current_block.GetValue())); 1093 ImGui::Text("Parameter Words Remaining: %d", 1094 static_cast<s32>(SignExtend32(s_state.status.parameter_words_remaining.GetValue()))); 1095 } 1096 1097 ImGui::End(); 1098 }