duckstation

DuckStation, archived from the revision just before upstream changed it to a proprietary software project; this version is the libre one.
git clone https://git.neptards.moe/u3shit/duckstation.git
Log | Files | Refs | README | LICENSE

mdec.cpp (34332B)


      1 // SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
      2 // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
      3 
      4 #include "mdec.h"
      5 #include "cpu_core.h"
      6 #include "dma.h"
      7 #include "system.h"
      8 #include "timing_event.h"
      9 
     10 #include "util/imgui_manager.h"
     11 #include "util/state_wrapper.h"
     12 
     13 #include "common/bitfield.h"
     14 #include "common/fifo_queue.h"
     15 #include "common/gsvector.h"
     16 #include "common/log.h"
     17 
     18 #include "imgui.h"
     19 
#include <algorithm>
#include <array>
#include <memory>
     22 
     23 Log_SetChannel(MDEC);
     24 
     25 namespace MDEC {
     26 namespace {
     27 
     28 static constexpr u32 DATA_IN_FIFO_SIZE = 1024;
     29 static constexpr u32 DATA_OUT_FIFO_SIZE = 768;
     30 static constexpr u32 NUM_BLOCKS = 6;
     31 static constexpr TickCount TICKS_PER_BLOCK = 448;
     32 
     33 enum DataOutputDepth : u8
     34 {
     35   DataOutputDepth_4Bit = 0,
     36   DataOutputDepth_8Bit = 1,
     37   DataOutputDepth_24Bit = 2,
     38   DataOutputDepth_15Bit = 3
     39 };
     40 
     41 enum class Command : u8
     42 {
     43   None = 0,
     44   DecodeMacroblock = 1,
     45   SetIqTab = 2,
     46   SetScale = 3
     47 };
     48 
     49 enum class State : u8
     50 {
     51   Idle,
     52   DecodingMacroblock,
     53   WritingMacroblock,
     54   SetIqTable,
     55   SetScaleTable,
     56   NoCommand
     57 };
     58 
     59 union StatusRegister
     60 {
     61   u32 bits;
     62 
     63   BitField<u32, bool, 31, 1> data_out_fifo_empty;
     64   BitField<u32, bool, 30, 1> data_in_fifo_full;
     65   BitField<u32, bool, 29, 1> command_busy;
     66   BitField<u32, bool, 28, 1> data_in_request;
     67   BitField<u32, bool, 27, 1> data_out_request;
     68   BitField<u32, DataOutputDepth, 25, 2> data_output_depth;
     69   BitField<u32, bool, 24, 1> data_output_signed;
     70   BitField<u32, u8, 23, 1> data_output_bit15;
     71   BitField<u32, u8, 16, 3> current_block;
     72   BitField<u32, u16, 0, 16> parameter_words_remaining;
     73 };
     74 
     75 union ControlRegister
     76 {
     77   u32 bits;
     78   BitField<u32, bool, 31, 1> reset;
     79   BitField<u32, bool, 30, 1> enable_dma_in;
     80   BitField<u32, bool, 29, 1> enable_dma_out;
     81 };
     82 
     83 union CommandWord
     84 {
     85   u32 bits;
     86 
     87   BitField<u32, Command, 29, 3> command;
     88   BitField<u32, DataOutputDepth, 27, 2> data_output_depth;
     89   BitField<u32, bool, 26, 1> data_output_signed;
     90   BitField<u32, u8, 25, 1> data_output_bit15;
     91   BitField<u32, u16, 0, 16> parameter_word_count;
     92 };
     93 
     94 } // namespace
     95 
     96 static bool HasPendingBlockCopyOut();
     97 
     98 static void SoftReset();
     99 static void ResetDecoder();
    100 static void UpdateStatus();
    101 
    102 static u32 ReadDataRegister();
    103 static void WriteCommandRegister(u32 value);
    104 static void Execute();
    105 
    106 static bool HandleDecodeMacroblockCommand();
    107 static void HandleSetQuantTableCommand();
    108 static void HandleSetScaleCommand();
    109 
    110 static void SetScaleMatrix(const u16* values);
    111 static bool DecodeMonoMacroblock();
    112 static bool DecodeColoredMacroblock();
    113 static void ScheduleBlockCopyOut(TickCount ticks);
    114 static void CopyOutBlock(void* param, TickCount ticks, TickCount ticks_late);
    115 
    116 static bool DecodeRLE_Old(s16* blk, const u8* qt);
    117 static void IDCT_Old(s16* blk);
    118 static void YUVToRGB_Old(u32 xx, u32 yy, const std::array<s16, 64>& Crblk, const std::array<s16, 64>& Cbblk,
    119                          const std::array<s16, 64>& Yblk);
    120 
    121 static bool DecodeRLE_New(s16* blk, const u8* qt);
    122 static void IDCT_New(s16* blk);
    123 static void YUVToRGB_New(u32 xx, u32 yy, const std::array<s16, 64>& Crblk, const std::array<s16, 64>& Cbblk,
    124                          const std::array<s16, 64>& Yblk);
    125 
    126 static void YUVToMono(const std::array<s16, 64>& Yblk);
    127 
    128 namespace {
    129 struct MDECState
    130 {
    131   StatusRegister status = {};
    132   bool enable_dma_in = false;
    133   bool enable_dma_out = false;
    134 
    135   // Even though the DMA is in words, we access the FIFO as halfwords.
    136   InlineFIFOQueue<u16, DATA_IN_FIFO_SIZE / sizeof(u16)> data_in_fifo;
    137   InlineFIFOQueue<u32, DATA_OUT_FIFO_SIZE / sizeof(u32)> data_out_fifo;
    138   State state = State::Idle;
    139   u32 remaining_halfwords = 0;
    140 
    141   std::array<u8, 64> iq_uv{};
    142   std::array<u8, 64> iq_y{};
    143 
    144   alignas(VECTOR_ALIGNMENT) std::array<s16, 64> scale_table{};
    145 
    146   // blocks, for colour: 0 - Crblk, 1 - Cbblk, 2-5 - Y 1-4
    147   alignas(VECTOR_ALIGNMENT) std::array<std::array<s16, 64>, NUM_BLOCKS> blocks;
    148   u32 current_block = 0;        // block (0-5)
    149   u32 current_coefficient = 64; // k (in block)
    150   u16 current_q_scale = 0;
    151 
    152   alignas(16) std::array<u32, 256> block_rgb{};
    153   TimingEvent block_copy_out_event{"MDEC Block Copy Out", 1, 1, &MDEC::CopyOutBlock, nullptr};
    154 
    155   u32 total_blocks_decoded = 0;
    156 };
    157 } // namespace
    158 
    159 ALIGN_TO_CACHE_LINE static MDECState s_state;
    160 } // namespace MDEC
    161 
    162 void MDEC::Initialize()
    163 {
    164   s_state.total_blocks_decoded = 0;
    165   Reset();
    166 }
    167 
    168 void MDEC::Shutdown()
    169 {
    170   s_state.block_copy_out_event.Deactivate();
    171 }
    172 
    173 void MDEC::Reset()
    174 {
    175   s_state.block_copy_out_event.Deactivate();
    176   SoftReset();
    177 }
    178 
    179 bool MDEC::DoState(StateWrapper& sw)
    180 {
    181   sw.Do(&s_state.status.bits);
    182   sw.Do(&s_state.enable_dma_in);
    183   sw.Do(&s_state.enable_dma_out);
    184   sw.Do(&s_state.data_in_fifo);
    185   sw.Do(&s_state.data_out_fifo);
    186   sw.Do(&s_state.state);
    187   sw.Do(&s_state.remaining_halfwords);
    188   sw.Do(&s_state.iq_uv);
    189   sw.Do(&s_state.iq_y);
    190 
    191   if (sw.GetVersion() < 66) [[unlikely]]
    192   {
    193     std::array<u16, 64> old_scale_matrix;
    194     sw.Do(&old_scale_matrix);
    195     SetScaleMatrix(old_scale_matrix.data());
    196   }
    197   else
    198   {
    199     sw.Do(&s_state.scale_table);
    200   }
    201 
    202   sw.Do(&s_state.blocks);
    203   sw.Do(&s_state.current_block);
    204   sw.Do(&s_state.current_coefficient);
    205   sw.Do(&s_state.current_q_scale);
    206   sw.Do(&s_state.block_rgb);
    207 
    208   bool block_copy_out_pending = HasPendingBlockCopyOut();
    209   sw.Do(&block_copy_out_pending);
    210   if (sw.IsReading())
    211     s_state.block_copy_out_event.SetState(block_copy_out_pending);
    212 
    213   return !sw.HasError();
    214 }
    215 
    216 u32 MDEC::ReadRegister(u32 offset)
    217 {
    218   switch (offset)
    219   {
    220     case 0:
    221       return ReadDataRegister();
    222 
    223     case 4:
    224     {
    225       TRACE_LOG("MDEC status register -> 0x{:08X}", s_state.status.bits);
    226       return s_state.status.bits;
    227     }
    228 
    229       [[unlikely]] default:
    230       {
    231         ERROR_LOG("Unknown MDEC register read: 0x{:08X}", offset);
    232         return UINT32_C(0xFFFFFFFF);
    233       }
    234   }
    235 }
    236 
    237 void MDEC::WriteRegister(u32 offset, u32 value)
    238 {
    239   switch (offset)
    240   {
    241     case 0:
    242     {
    243       WriteCommandRegister(value);
    244       return;
    245     }
    246 
    247     case 4:
    248     {
    249       DEBUG_LOG("MDEC control register <- 0x{:08X}", value);
    250 
    251       const ControlRegister cr{value};
    252       if (cr.reset)
    253         SoftReset();
    254 
    255       s_state.enable_dma_in = cr.enable_dma_in;
    256       s_state.enable_dma_out = cr.enable_dma_out;
    257       Execute();
    258       return;
    259     }
    260 
    261       [[unlikely]] default:
    262       {
    263         ERROR_LOG("Unknown MDEC register write: 0x{:08X} <- 0x{:08X}", offset, value);
    264         return;
    265       }
    266   }
    267 }
    268 
    269 void MDEC::DMARead(u32* words, u32 word_count)
    270 {
    271   if (s_state.data_out_fifo.GetSize() < word_count) [[unlikely]]
    272   {
    273     WARNING_LOG("Insufficient data in output FIFO (requested {}, have {})", word_count,
    274                 s_state.data_out_fifo.GetSize());
    275   }
    276 
    277   const u32 words_to_read = std::min(word_count, s_state.data_out_fifo.GetSize());
    278   if (words_to_read > 0)
    279   {
    280     s_state.data_out_fifo.PopRange(words, words_to_read);
    281     words += words_to_read;
    282     word_count -= words_to_read;
    283   }
    284 
    285   DEBUG_LOG("DMA read complete, {} bytes left", s_state.data_out_fifo.GetSize() * sizeof(u32));
    286   if (s_state.data_out_fifo.IsEmpty())
    287     Execute();
    288 }
    289 
    290 void MDEC::DMAWrite(const u32* words, u32 word_count)
    291 {
    292   if (s_state.data_in_fifo.GetSpace() < (word_count * 2)) [[unlikely]]
    293   {
    294     WARNING_LOG("Input FIFO overflow (writing {}, space {})", word_count * 2, s_state.data_in_fifo.GetSpace());
    295   }
    296 
    297   const u32 halfwords_to_write = std::min(word_count * 2, s_state.data_in_fifo.GetSpace() & ~u32(2));
    298   s_state.data_in_fifo.PushRange(reinterpret_cast<const u16*>(words), halfwords_to_write);
    299   Execute();
    300 }
    301 
    302 bool MDEC::HasPendingBlockCopyOut()
    303 {
    304   return s_state.block_copy_out_event.IsActive();
    305 }
    306 
    307 void MDEC::SoftReset()
    308 {
    309   s_state.status.bits = 0;
    310   s_state.enable_dma_in = false;
    311   s_state.enable_dma_out = false;
    312   s_state.data_in_fifo.Clear();
    313   s_state.data_out_fifo.Clear();
    314   s_state.state = State::Idle;
    315   s_state.remaining_halfwords = 0;
    316   s_state.current_block = 0;
    317   s_state.current_coefficient = 64;
    318   s_state.current_q_scale = 0;
    319   s_state.block_copy_out_event.Deactivate();
    320   UpdateStatus();
    321 }
    322 
    323 void MDEC::ResetDecoder()
    324 {
    325   s_state.current_block = 0;
    326   s_state.current_coefficient = 64;
    327   s_state.current_q_scale = 0;
    328 }
    329 
    330 void MDEC::UpdateStatus()
    331 {
    332   s_state.status.data_out_fifo_empty = s_state.data_out_fifo.IsEmpty();
    333   s_state.status.data_in_fifo_full = s_state.data_in_fifo.IsFull();
    334 
    335   s_state.status.command_busy = (s_state.state != State::Idle);
    336   s_state.status.parameter_words_remaining = Truncate16((s_state.remaining_halfwords / 2) - 1);
    337   s_state.status.current_block = (s_state.current_block + 4) % NUM_BLOCKS;
    338 
    339   // we always want data in if it's enabled
    340   const bool data_in_request = s_state.enable_dma_in && s_state.data_in_fifo.GetSpace() >= (32 * 2);
    341   s_state.status.data_in_request = data_in_request;
    342   DMA::SetRequest(DMA::Channel::MDECin, data_in_request);
    343 
    344   // we only want to send data out if we have some in the fifo
    345   const bool data_out_request = s_state.enable_dma_out && !s_state.data_out_fifo.IsEmpty();
    346   s_state.status.data_out_request = data_out_request;
    347   DMA::SetRequest(DMA::Channel::MDECout, data_out_request);
    348 }
    349 
    350 u32 MDEC::ReadDataRegister()
    351 {
    352   if (s_state.data_out_fifo.IsEmpty())
    353   {
    354     // Stall the CPU until we're done processing.
    355     if (HasPendingBlockCopyOut())
    356     {
    357       DEV_LOG("MDEC data out FIFO empty on read - stalling CPU");
    358       CPU::AddPendingTicks(s_state.block_copy_out_event.GetTicksUntilNextExecution());
    359     }
    360     else
    361     {
    362       WARNING_LOG("MDEC data out FIFO empty on read and no data processing");
    363       return UINT32_C(0xFFFFFFFF);
    364     }
    365   }
    366 
    367   const u32 value = s_state.data_out_fifo.Pop();
    368   if (s_state.data_out_fifo.IsEmpty())
    369     Execute();
    370   else
    371     UpdateStatus();
    372 
    373   return value;
    374 }
    375 
    376 void MDEC::WriteCommandRegister(u32 value)
    377 {
    378   TRACE_LOG("MDEC command/data register <- 0x{:08X}", value);
    379 
    380   s_state.data_in_fifo.Push(Truncate16(value));
    381   s_state.data_in_fifo.Push(Truncate16(value >> 16));
    382 
    383   Execute();
    384 }
    385 
    386 void MDEC::Execute()
    387 {
    388   for (;;)
    389   {
    390     switch (s_state.state)
    391     {
    392       case State::Idle:
    393       {
    394         if (s_state.data_in_fifo.GetSize() < 2)
    395           goto finished;
    396 
    397         // first word
    398         const CommandWord cw{ZeroExtend32(s_state.data_in_fifo.Peek(0)) |
    399                              (ZeroExtend32(s_state.data_in_fifo.Peek(1)) << 16)};
    400         s_state.status.data_output_depth = cw.data_output_depth;
    401         s_state.status.data_output_signed = cw.data_output_signed;
    402         s_state.status.data_output_bit15 = cw.data_output_bit15;
    403         s_state.data_in_fifo.Remove(2);
    404         s_state.data_out_fifo.Clear();
    405 
    406         u32 num_words;
    407         State new_state;
    408         switch (cw.command)
    409         {
    410           case Command::DecodeMacroblock:
    411             num_words = ZeroExtend32(cw.parameter_word_count.GetValue());
    412             new_state = State::DecodingMacroblock;
    413             break;
    414 
    415           case Command::SetIqTab:
    416             num_words = 16 + (((cw.bits & 1) != 0) ? 16 : 0);
    417             new_state = State::SetIqTable;
    418             break;
    419 
    420           case Command::SetScale:
    421             num_words = 32;
    422             new_state = State::SetScaleTable;
    423             break;
    424 
    425           default:
    426             [[unlikely]] DEV_LOG("Invalid MDEC command 0x{:08X}", cw.bits);
    427             num_words = cw.parameter_word_count.GetValue();
    428             new_state = State::NoCommand;
    429             break;
    430         }
    431 
    432         DEBUG_LOG("MDEC command: 0x{:08X} ({}, {} words in parameter, {} expected)", cw.bits,
    433                   static_cast<u8>(cw.command.GetValue()), cw.parameter_word_count.GetValue(), num_words);
    434 
    435         s_state.remaining_halfwords = num_words * 2;
    436         s_state.state = new_state;
    437         UpdateStatus();
    438         continue;
    439       }
    440 
    441       case State::DecodingMacroblock:
    442       {
    443         if (HandleDecodeMacroblockCommand())
    444         {
    445           // we should be writing out now
    446           DebugAssert(s_state.state == State::WritingMacroblock);
    447           goto finished;
    448         }
    449 
    450         if (s_state.remaining_halfwords == 0 && s_state.current_block != NUM_BLOCKS)
    451         {
    452           // expecting data, but nothing more will be coming. bail out
    453           ResetDecoder();
    454           s_state.state = State::Idle;
    455           continue;
    456         }
    457 
    458         goto finished;
    459       }
    460 
    461       case State::WritingMacroblock:
    462       {
    463         // this gets executed via the event, so if we get here, wait.
    464         goto finished;
    465       }
    466 
    467       case State::SetIqTable:
    468       {
    469         if (s_state.data_in_fifo.GetSize() < s_state.remaining_halfwords)
    470           goto finished;
    471 
    472         HandleSetQuantTableCommand();
    473         s_state.state = State::Idle;
    474         UpdateStatus();
    475         continue;
    476       }
    477 
    478       case State::SetScaleTable:
    479       {
    480         if (s_state.data_in_fifo.GetSize() < s_state.remaining_halfwords)
    481           goto finished;
    482 
    483         HandleSetScaleCommand();
    484         s_state.state = State::Idle;
    485         UpdateStatus();
    486         continue;
    487       }
    488 
    489       case State::NoCommand:
    490       {
    491         // can potentially have a large amount of halfwords, so eat them as we go
    492         const u32 words_to_consume = std::min(s_state.remaining_halfwords, s_state.data_in_fifo.GetSize());
    493         s_state.data_in_fifo.Remove(words_to_consume);
    494         s_state.remaining_halfwords -= words_to_consume;
    495         if (s_state.remaining_halfwords == 0)
    496           goto finished;
    497 
    498         s_state.state = State::Idle;
    499         UpdateStatus();
    500         continue;
    501       }
    502 
    503       default:
    504         UnreachableCode();
    505         return;
    506     }
    507   }
    508 
    509 finished:
    510   // if we get here, it's because the FIFO is now empty
    511   UpdateStatus();
    512 }
    513 
    514 bool MDEC::HandleDecodeMacroblockCommand()
    515 {
    516   if (s_state.status.data_output_depth <= DataOutputDepth_8Bit)
    517     return DecodeMonoMacroblock();
    518   else
    519     return DecodeColoredMacroblock();
    520 }
    521 
    522 bool MDEC::DecodeMonoMacroblock()
    523 {
    524   // TODO: This should guard the output not the input
    525   if (!s_state.data_out_fifo.IsEmpty())
    526     return false;
    527 
    528   if (g_settings.use_old_mdec_routines) [[unlikely]]
    529   {
    530     if (!DecodeRLE_Old(s_state.blocks[0].data(), s_state.iq_y.data()))
    531       return false;
    532 
    533     IDCT_Old(s_state.blocks[0].data());
    534   }
    535   else
    536   {
    537     if (!DecodeRLE_New(s_state.blocks[0].data(), s_state.iq_y.data()))
    538       return false;
    539 
    540     IDCT_New(s_state.blocks[0].data());
    541   }
    542 
    543   DEBUG_LOG("Decoded mono macroblock, {} words remaining", s_state.remaining_halfwords / 2);
    544   ResetDecoder();
    545   s_state.state = State::WritingMacroblock;
    546 
    547   YUVToMono(s_state.blocks[0]);
    548 
    549   ScheduleBlockCopyOut(TICKS_PER_BLOCK * 6);
    550 
    551   s_state.total_blocks_decoded++;
    552   return true;
    553 }
    554 
    555 bool MDEC::DecodeColoredMacroblock()
    556 {
    557   if (g_settings.use_old_mdec_routines) [[unlikely]]
    558   {
    559     for (; s_state.current_block < NUM_BLOCKS; s_state.current_block++)
    560     {
    561       if (!DecodeRLE_Old(s_state.blocks[s_state.current_block].data(),
    562                          (s_state.current_block >= 2) ? s_state.iq_y.data() : s_state.iq_uv.data()))
    563         return false;
    564 
    565       IDCT_Old(s_state.blocks[s_state.current_block].data());
    566     }
    567 
    568     if (!s_state.data_out_fifo.IsEmpty())
    569       return false;
    570 
    571     // done decoding
    572     DEBUG_LOG("Decoded colored macroblock, {} words remaining", s_state.remaining_halfwords / 2);
    573     ResetDecoder();
    574     s_state.state = State::WritingMacroblock;
    575 
    576     YUVToRGB_Old(0, 0, s_state.blocks[0], s_state.blocks[1], s_state.blocks[2]);
    577     YUVToRGB_Old(8, 0, s_state.blocks[0], s_state.blocks[1], s_state.blocks[3]);
    578     YUVToRGB_Old(0, 8, s_state.blocks[0], s_state.blocks[1], s_state.blocks[4]);
    579     YUVToRGB_Old(8, 8, s_state.blocks[0], s_state.blocks[1], s_state.blocks[5]);
    580   }
    581   else
    582   {
    583     for (; s_state.current_block < NUM_BLOCKS; s_state.current_block++)
    584     {
    585       if (!DecodeRLE_New(s_state.blocks[s_state.current_block].data(),
    586                          (s_state.current_block >= 2) ? s_state.iq_y.data() : s_state.iq_uv.data()))
    587         return false;
    588 
    589       IDCT_New(s_state.blocks[s_state.current_block].data());
    590     }
    591 
    592     if (!s_state.data_out_fifo.IsEmpty())
    593       return false;
    594 
    595     // done decoding
    596     DEBUG_LOG("Decoded colored macroblock, {} words remaining", s_state.remaining_halfwords / 2);
    597     ResetDecoder();
    598     s_state.state = State::WritingMacroblock;
    599 
    600     YUVToRGB_New(0, 0, s_state.blocks[0], s_state.blocks[1], s_state.blocks[2]);
    601     YUVToRGB_New(8, 0, s_state.blocks[0], s_state.blocks[1], s_state.blocks[3]);
    602     YUVToRGB_New(0, 8, s_state.blocks[0], s_state.blocks[1], s_state.blocks[4]);
    603     YUVToRGB_New(8, 8, s_state.blocks[0], s_state.blocks[1], s_state.blocks[5]);
    604   }
    605 
    606   s_state.total_blocks_decoded += 4;
    607 
    608   ScheduleBlockCopyOut(TICKS_PER_BLOCK * 6);
    609   return true;
    610 }
    611 
    612 void MDEC::ScheduleBlockCopyOut(TickCount ticks)
    613 {
    614   DebugAssert(!HasPendingBlockCopyOut());
    615   DEBUG_LOG("Scheduling block copy out in {} ticks", ticks);
    616 
    617   s_state.block_copy_out_event.SetIntervalAndSchedule(ticks);
    618 }
    619 
    620 void MDEC::CopyOutBlock(void* param, TickCount ticks, TickCount ticks_late)
    621 {
    622   Assert(s_state.state == State::WritingMacroblock);
    623   s_state.block_copy_out_event.Deactivate();
    624 
    625   switch (s_state.status.data_output_depth)
    626   {
    627     case DataOutputDepth_4Bit:
    628     {
    629       const u32* in_ptr = s_state.block_rgb.data();
    630       for (u32 i = 0; i < (64 / 8); i++)
    631       {
    632         u32 value = *(in_ptr++) >> 4;
    633         value |= (*(in_ptr++) >> 4) << 4;
    634         value |= (*(in_ptr++) >> 4) << 8;
    635         value |= (*(in_ptr++) >> 4) << 12;
    636         value |= (*(in_ptr++) >> 4) << 16;
    637         value |= (*(in_ptr++) >> 4) << 20;
    638         value |= (*(in_ptr++) >> 4) << 24;
    639         value |= (*(in_ptr++) >> 4) << 28;
    640         s_state.data_out_fifo.Push(value);
    641       }
    642     }
    643     break;
    644 
    645     case DataOutputDepth_8Bit:
    646     {
    647       const u32* in_ptr = s_state.block_rgb.data();
    648       for (u32 i = 0; i < (64 / 4); i++)
    649       {
    650         u32 value = *in_ptr++;
    651         value |= *in_ptr++ << 8;
    652         value |= *in_ptr++ << 16;
    653         value |= *in_ptr++ << 24;
    654         s_state.data_out_fifo.Push(value);
    655       }
    656     }
    657     break;
    658 
    659     case DataOutputDepth_24Bit:
    660     {
    661       // pack tightly
    662       u32 index = 0;
    663       u32 state = 0;
    664       u32 rgb = 0;
    665       while (index < s_state.block_rgb.size())
    666       {
    667         switch (state)
    668         {
    669           case 0:
    670             rgb = s_state.block_rgb[index++]; // RGB-
    671             state = 1;
    672             break;
    673           case 1:
    674             rgb |= (s_state.block_rgb[index] & 0xFF) << 24; // RGBR
    675             s_state.data_out_fifo.Push(rgb);
    676             rgb = s_state.block_rgb[index] >> 8; // GB--
    677             index++;
    678             state = 2;
    679             break;
    680           case 2:
    681             rgb |= s_state.block_rgb[index] << 16; // GBRG
    682             s_state.data_out_fifo.Push(rgb);
    683             rgb = s_state.block_rgb[index] >> 16; // B---
    684             index++;
    685             state = 3;
    686             break;
    687           case 3:
    688             rgb |= s_state.block_rgb[index] << 8; // BRGB
    689             s_state.data_out_fifo.Push(rgb);
    690             index++;
    691             state = 0;
    692             break;
    693         }
    694       }
    695       break;
    696     }
    697 
    698     case DataOutputDepth_15Bit:
    699     {
    700       if (g_settings.use_old_mdec_routines) [[unlikely]]
    701       {
    702         const u16 a = ZeroExtend16(s_state.status.data_output_bit15.GetValue()) << 15;
    703         for (u32 i = 0; i < static_cast<u32>(s_state.block_rgb.size());)
    704         {
    705           u32 color = s_state.block_rgb[i++];
    706           u16 r = Truncate16((color >> 3) & 0x1Fu);
    707           u16 g = Truncate16((color >> 11) & 0x1Fu);
    708           u16 b = Truncate16((color >> 19) & 0x1Fu);
    709           const u16 color15a = r | (g << 5) | (b << 10) | (a << 15);
    710 
    711           color = s_state.block_rgb[i++];
    712           r = Truncate16((color >> 3) & 0x1Fu);
    713           g = Truncate16((color >> 11) & 0x1Fu);
    714           b = Truncate16((color >> 19) & 0x1Fu);
    715           const u16 color15b = r | (g << 5) | (b << 10) | (a << 15);
    716 
    717           s_state.data_out_fifo.Push(ZeroExtend32(color15a) | (ZeroExtend32(color15b) << 16));
    718         }
    719       }
    720       else
    721       {
    722         const u32 a = ZeroExtend32(s_state.status.data_output_bit15.GetValue()) << 15;
    723         for (u32 i = 0; i < static_cast<u32>(s_state.block_rgb.size());)
    724         {
    725 #define E8TO5(color) (std::min<u32>((((color) + 4) >> 3), 0x1F))
    726           u32 color = s_state.block_rgb[i++];
    727           u32 r = E8TO5(color & 0xFFu);
    728           u32 g = E8TO5((color >> 8) & 0xFFu);
    729           u32 b = E8TO5((color >> 16) & 0xFFu);
    730           const u32 color15a = r | (g << 5) | (b << 10) | a;
    731 
    732           color = s_state.block_rgb[i++];
    733           r = E8TO5(color & 0xFFu);
    734           g = E8TO5((color >> 8) & 0xFFu);
    735           b = E8TO5((color >> 16) & 0xFFu);
    736           const u32 color15b = r | (g << 5) | (b << 10) | a;
    737 #undef E8TO5
    738 
    739           s_state.data_out_fifo.Push(color15a | (color15b << 16));
    740         }
    741       }
    742     }
    743     break;
    744 
    745     default:
    746       break;
    747   }
    748 
    749   DEBUG_LOG("Block copied out, fifo size = {} ({} bytes)", s_state.data_out_fifo.GetSize(),
    750             s_state.data_out_fifo.GetSize() * sizeof(u32));
    751 
    752   // if we've copied out all blocks, command is complete
    753   s_state.state = (s_state.remaining_halfwords == 0) ? State::Idle : State::DecodingMacroblock;
    754   Execute();
    755 }
    756 
    757 bool MDEC::DecodeRLE_Old(s16* blk, const u8* qt)
    758 {
    759   static constexpr std::array<u8, 64> zagzig = {{0,  1,  8,  16, 9,  2,  3,  10, 17, 24, 32, 25, 18, 11, 4,  5,
    760                                                  12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6,  7,  14, 21, 28,
    761                                                  35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
    762                                                  58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63}};
    763 
    764   if (s_state.current_coefficient == 64)
    765   {
    766     std::fill_n(blk, 64, s16(0));
    767 
    768     // skip padding at start
    769     u16 n;
    770     for (;;)
    771     {
    772       if (s_state.data_in_fifo.IsEmpty() || s_state.remaining_halfwords == 0)
    773         return false;
    774 
    775       n = s_state.data_in_fifo.Pop();
    776       s_state.remaining_halfwords--;
    777 
    778       if (n == 0xFE00)
    779         continue;
    780       else
    781         break;
    782     }
    783 
    784     s_state.current_coefficient = 0;
    785     s_state.current_q_scale = (n >> 10) & 0x3F;
    786     s32 val = SignExtendN<10, s32>(static_cast<s32>(n & 0x3FF)) *
    787               static_cast<s32>(ZeroExtend32(qt[s_state.current_coefficient]));
    788 
    789     if (s_state.current_q_scale == 0)
    790       val = SignExtendN<10, s32>(static_cast<s32>(n & 0x3FF)) * 2;
    791 
    792     val = std::clamp(val, -0x400, 0x3FF);
    793     if (s_state.current_q_scale > 0)
    794       blk[zagzig[s_state.current_coefficient]] = static_cast<s16>(val);
    795     else
    796       blk[s_state.current_coefficient] = static_cast<s16>(val);
    797   }
    798 
    799   while (!s_state.data_in_fifo.IsEmpty() && s_state.remaining_halfwords > 0)
    800   {
    801     u16 n = s_state.data_in_fifo.Pop();
    802     s_state.remaining_halfwords--;
    803 
    804     s_state.current_coefficient += ((n >> 10) & 0x3F) + 1;
    805     if (s_state.current_coefficient < 64)
    806     {
    807       s32 val =
    808         (SignExtendN<10, s32>(static_cast<s32>(n & 0x3FF)) *
    809            static_cast<s32>(ZeroExtend32(qt[s_state.current_coefficient])) * static_cast<s32>(s_state.current_q_scale) +
    810          4) /
    811         8;
    812 
    813       if (s_state.current_q_scale == 0)
    814         val = SignExtendN<10, s32>(static_cast<s32>(n & 0x3FF)) * 2;
    815 
    816       val = std::clamp(val, -0x400, 0x3FF);
    817       if (s_state.current_q_scale > 0)
    818         blk[zagzig[s_state.current_coefficient]] = static_cast<s16>(val);
    819       else
    820         blk[s_state.current_coefficient] = static_cast<s16>(val);
    821     }
    822 
    823     if (s_state.current_coefficient >= 63)
    824     {
    825       s_state.current_coefficient = 64;
    826       return true;
    827     }
    828   }
    829 
    830   return false;
    831 }
    832 
    833 void MDEC::IDCT_Old(s16* blk)
    834 {
    835   std::array<s64, 64> temp_buffer;
    836   for (u32 x = 0; x < 8; x++)
    837   {
    838     for (u32 y = 0; y < 8; y++)
    839     {
    840       s64 sum = 0;
    841       for (u32 u = 0; u < 8; u++)
    842         sum += s32(blk[u * 8 + x]) * s32(s_state.scale_table[y * 8 + u]);
    843       temp_buffer[x + y * 8] = sum;
    844     }
    845   }
    846   for (u32 x = 0; x < 8; x++)
    847   {
    848     for (u32 y = 0; y < 8; y++)
    849     {
    850       s64 sum = 0;
    851       for (u32 u = 0; u < 8; u++)
    852         sum += s64(temp_buffer[u + y * 8]) * s32(s_state.scale_table[x * 8 + u]);
    853 
    854       blk[x + y * 8] =
    855         static_cast<s16>(std::clamp<s32>(SignExtendN<9, s32>((sum >> 32) + ((sum >> 31) & 1)), -128, 127));
    856     }
    857   }
    858 }
    859 
    860 void MDEC::YUVToRGB_Old(u32 xx, u32 yy, const std::array<s16, 64>& Crblk, const std::array<s16, 64>& Cbblk,
    861                         const std::array<s16, 64>& Yblk)
    862 {
    863   const s16 addval = s_state.status.data_output_signed ? 0 : 0x80;
    864   for (u32 y = 0; y < 8; y++)
    865   {
    866     for (u32 x = 0; x < 8; x++)
    867     {
    868       s16 R = Crblk[((x + xx) / 2) + ((y + yy) / 2) * 8];
    869       s16 B = Cbblk[((x + xx) / 2) + ((y + yy) / 2) * 8];
    870       s16 G = static_cast<s16>((-0.3437f * static_cast<float>(B)) + (-0.7143f * static_cast<float>(R)));
    871 
    872       R = static_cast<s16>(1.402f * static_cast<float>(R));
    873       B = static_cast<s16>(1.772f * static_cast<float>(B));
    874 
    875       s16 Y = Yblk[x + y * 8];
    876       R = static_cast<s16>(std::clamp(static_cast<int>(Y) + R, -128, 127)) + addval;
    877       G = static_cast<s16>(std::clamp(static_cast<int>(Y) + G, -128, 127)) + addval;
    878       B = static_cast<s16>(std::clamp(static_cast<int>(Y) + B, -128, 127)) + addval;
    879 
    880       s_state.block_rgb[(x + xx) + ((y + yy) * 16)] = ZeroExtend32(static_cast<u16>(R)) |
    881                                                       (ZeroExtend32(static_cast<u16>(G)) << 8) |
    882                                                       (ZeroExtend32(static_cast<u16>(B)) << 16);
    883     }
    884   }
    885 }
    886 
    887 bool MDEC::DecodeRLE_New(s16* blk, const u8* qt)
    888 {
    889   // Swapped to row-major so we can vectorize the IDCT.
    890   static constexpr std::array<u8, 64> zigzag = {{0,  8,  1,  2,  9,  16, 24, 17, 10, 3,  4,  11, 18, 25, 32, 40,
    891                                                  33, 26, 19, 12, 5,  6,  13, 20, 27, 34, 41, 48, 56, 49, 42, 35,
    892                                                  28, 21, 14, 7,  15, 22, 29, 36, 43, 50, 57, 58, 51, 44, 37, 30,
    893                                                  23, 31, 38, 45, 52, 59, 60, 53, 46, 39, 47, 54, 61, 62, 55, 63}};
    894 
    895   if (s_state.current_coefficient == 64)
    896   {
    897     std::fill_n(blk, 64, s16(0));
    898 
    899     // skip padding at start
    900     u16 n;
    901     for (;;)
    902     {
    903       if (s_state.data_in_fifo.IsEmpty() || s_state.remaining_halfwords == 0)
    904         return false;
    905 
    906       n = s_state.data_in_fifo.Pop();
    907       s_state.remaining_halfwords--;
    908 
    909       if (n == 0xFE00)
    910         continue;
    911       else
    912         break;
    913     }
    914 
    915     s_state.current_coefficient = 0;
    916     s_state.current_q_scale = n >> 10;
    917 
    918     // Store the DCT blocks with an additional 4 bits of precision.
    919     const s32 val = SignExtendN<10, s32>(static_cast<s32>(n));
    920     const s32 coeff =
    921       (s_state.current_q_scale == 0) ? (val << 5) : (((val * qt[0]) << 4) + (val ? ((val < 0) ? 8 : -8) : 0));
    922     blk[zigzag[0]] = static_cast<s16>(std::clamp(coeff, -0x4000, 0x3FFF));
    923   }
    924 
    925   while (!s_state.data_in_fifo.IsEmpty() && s_state.remaining_halfwords > 0)
    926   {
    927     u16 n = s_state.data_in_fifo.Pop();
    928     s_state.remaining_halfwords--;
    929 
    930     s_state.current_coefficient += ((n >> 10) + 1);
    931     if (s_state.current_coefficient < 64)
    932     {
    933       const s32 val = SignExtendN<10, s32>(n);
    934       const s32 scq = static_cast<s32>(s_state.current_q_scale * qt[s_state.current_coefficient]);
    935       const s32 coeff = (scq == 0) ? (val << 5) : ((((val * scq) >> 3) << 4) + (val ? ((val < 0) ? 8 : -8) : 0));
    936       blk[zigzag[s_state.current_coefficient]] = static_cast<s16>(std::clamp(coeff, -0x4000, 0x3FFF));
    937     }
    938 
    939     if (s_state.current_coefficient >= 63)
    940     {
    941       s_state.current_coefficient = 64;
    942       return true;
    943     }
    944   }
    945 
    946   return false;
    947 }
    948 
    949 static s16 IDCTRow(const s16* blk, const s16* idct_matrix)
    950 {
    951   // IDCT matrix is -32768..32767, block is -16384..16383. 4 adds can happen without overflow.
    952   GSVector4i sum = GSVector4i::load<false>(blk).madd_s16(GSVector4i::load<true>(idct_matrix)).addp_s32();
    953   return static_cast<s16>(((static_cast<s64>(sum.extract32<0>()) + static_cast<s64>(sum.extract32<1>())) + 0x20000) >>
    954                           18);
    955 }
    956 
    957 void MDEC::IDCT_New(s16* blk)
    958 {
    959   alignas(VECTOR_ALIGNMENT) std::array<s16, 64> temp;
    960   for (u32 x = 0; x < 8; x++)
    961   {
    962     for (u32 y = 0; y < 8; y++)
    963       temp[y * 8 + x] = IDCTRow(&blk[x * 8], &s_state.scale_table[y * 8]);
    964   }
    965   for (u32 x = 0; x < 8; x++)
    966   {
    967     for (u32 y = 0; y < 8; y++)
    968     {
    969       const s32 sum = IDCTRow(&temp[x * 8], &s_state.scale_table[y * 8]);
    970       blk[x * 8 + y] = static_cast<s16>(std::clamp(SignExtendN<9, s32>(sum), -128, 127));
    971     }
    972   }
    973 }
    974 
    975 void MDEC::YUVToRGB_New(u32 xx, u32 yy, const std::array<s16, 64>& Crblk, const std::array<s16, 64>& Cbblk,
    976                         const std::array<s16, 64>& Yblk)
    977 {
    978   const GSVector4i addval = s_state.status.data_output_signed ? GSVector4i::cxpr(0) : GSVector4i::cxpr(0x80808080);
    979   for (u32 y = 0; y < 8; y++)
    980   {
    981     const GSVector4i Cr = GSVector4i::loadl(&Crblk[(xx / 2) + ((y + yy) / 2) * 8]).s16to32();
    982     const GSVector4i Cb = GSVector4i::loadl(&Cbblk[(xx / 2) + ((y + yy) / 2) * 8]).s16to32();
    983     const GSVector4i Y = GSVector4i::load<true>(&Yblk[y * 8]);
    984 
    985     // BT.601 YUV->RGB coefficients, rounding formula from Mednafen.
    986     // r = clamp(sext9(Y + (((359 * Cr) + 0x80) >> 8)), -128, 127) + addval;
    987     // g = clamp(sext9(Y + ((((-88 * Cb) & ~0x1F) + ((-183 * Cr) & ~0x07) + 0x80) >> 8)), -128, 127) + addval
    988     // b = clamp(sext9<9, s32>(Y + (((454 * Cb) + 0x80) >> 8)), -128, 127) + addval
    989 
    990     // Need to do the multiply as 32-bit, since 127 * 359 is greater than INT16_MAX.
    991     // upl16(self) = interleave XYZW0000 -> XXYYZZWW.
    992     const GSVector4i Crmul = Cr.mul32l(GSVector4i::cxpr(359)).add16(GSVector4i::cxpr(0x80)).sra32<8>().ps32();
    993     const GSVector4i Cbmul = Cb.mul32l(GSVector4i::cxpr(454)).add16(GSVector4i::cxpr(0x80)).sra32<8>().ps32();
    994     const GSVector4i CrCbmul = (Cb.mul32l(GSVector4i::cxpr(-88)) & GSVector4i::cxpr(~0x1F))
    995                                  .add32(Cr.mul32l(GSVector4i::cxpr(-183)) & GSVector4i::cxpr(~0x07))
    996                                  .add32(GSVector4i::cxpr(0x80))
    997                                  .sra32<8>()
    998                                  .ps32();
    999     const GSVector4i r = Crmul.upl16(Crmul).add16(Y).sll16<7>().sra16<7>().ps16().add8(addval);
   1000     const GSVector4i g = CrCbmul.upl16(CrCbmul).add16(Y).sll16<7>().sra16<7>().ps16().add8(addval);
   1001     const GSVector4i b = Cbmul.upl16(Cbmul).add16(Y).sll16<7>().sra16<7>().ps16().add8(addval);
   1002     const GSVector4i rg = r.upl8(g);
   1003     const GSVector4i b0 = b.upl8();
   1004     const GSVector4i rgblow = rg.upl16(b0);
   1005     const GSVector4i rgbhigh = rg.uph16(b0);
   1006 
   1007     u32* const out_row = &s_state.block_rgb[xx + ((y + yy) * 16)];
   1008     GSVector4i::store<false>(&out_row[0], rgblow);
   1009     GSVector4i::store<false>(&out_row[4], rgbhigh);
   1010   }
   1011 }
   1012 
   1013 void MDEC::YUVToMono(const std::array<s16, 64>& Yblk)
   1014 {
   1015   const s32 addval = s_state.status.data_output_signed ? 0 : 0x80;
   1016   for (u32 i = 0; i < 64; i++)
   1017     s_state.block_rgb[i] = static_cast<u32>(std::clamp(SignExtendN<9, s32>(Yblk[i]), -128, 127) + addval);
   1018 }
   1019 
   1020 void MDEC::HandleSetQuantTableCommand()
   1021 {
   1022   DebugAssert(s_state.remaining_halfwords >= 32);
   1023 
   1024   // TODO: Remove extra copies..
   1025   std::array<u16, 32> packed_data;
   1026   s_state.data_in_fifo.PopRange(packed_data.data(), static_cast<u32>(packed_data.size()));
   1027   s_state.remaining_halfwords -= 32;
   1028   std::memcpy(s_state.iq_y.data(), packed_data.data(), s_state.iq_y.size());
   1029 
   1030   if (s_state.remaining_halfwords > 0)
   1031   {
   1032     DebugAssert(s_state.remaining_halfwords >= 32);
   1033 
   1034     s_state.data_in_fifo.PopRange(packed_data.data(), static_cast<u32>(packed_data.size()));
   1035     std::memcpy(s_state.iq_uv.data(), packed_data.data(), s_state.iq_uv.size());
   1036   }
   1037 }
   1038 
   1039 void MDEC::HandleSetScaleCommand()
   1040 {
   1041   DebugAssert(s_state.remaining_halfwords == 64);
   1042 
   1043   std::array<u16, 64> packed_data;
   1044   s_state.data_in_fifo.PopRange(packed_data.data(), static_cast<u32>(packed_data.size()));
   1045   s_state.remaining_halfwords -= 32;
   1046   SetScaleMatrix(packed_data.data());
   1047 }
   1048 
   1049 void MDEC::SetScaleMatrix(const u16* values)
   1050 {
   1051   for (u32 y = 0; y < 8; y++)
   1052   {
   1053     for (u32 x = 0; x < 8; x++)
   1054       s_state.scale_table[y * 8 + x] = values[x * 8 + y];
   1055   }
   1056 }
   1057 
   1058 void MDEC::DrawDebugStateWindow()
   1059 {
   1060   const float framebuffer_scale = ImGuiManager::GetGlobalScale();
   1061 
   1062   ImGui::SetNextWindowSize(ImVec2(300.0f * framebuffer_scale, 350.0f * framebuffer_scale), ImGuiCond_FirstUseEver);
   1063   if (!ImGui::Begin("MDEC State", nullptr))
   1064   {
   1065     ImGui::End();
   1066     return;
   1067   }
   1068 
   1069   static constexpr std::array<const char*, 5> state_names = {
   1070     {"None", "Decoding Macroblock", "Writing Macroblock", "SetIqTab", "SetScale"}};
   1071   static constexpr std::array<const char*, 4> output_depths = {{"4-bit", "8-bit", "24-bit", "15-bit"}};
   1072   static constexpr std::array<const char*, 7> block_names = {{"Crblk", "Cbblk", "Y1", "Y2", "Y3", "Y4", "Output"}};
   1073 
   1074   ImGui::Text("Blocks Decoded: %u", s_state.total_blocks_decoded);
   1075   ImGui::Text("Data-In FIFO Size: %u (%u bytes)", s_state.data_in_fifo.GetSize(), s_state.data_in_fifo.GetSize() * 4);
   1076   ImGui::Text("Data-Out FIFO Size: %u (%u bytes)", s_state.data_out_fifo.GetSize(),
   1077               s_state.data_out_fifo.GetSize() * 4);
   1078   ImGui::Text("DMA Enable: %s%s", s_state.enable_dma_in ? "In " : "", s_state.enable_dma_out ? "Out" : "");
   1079   ImGui::Text("Current State: %s", state_names[static_cast<u8>(s_state.state)]);
   1080   ImGui::Text("Current Block: %s", block_names[s_state.current_block]);
   1081   ImGui::Text("Current Coefficient: %u", s_state.current_coefficient);
   1082 
   1083   if (ImGui::CollapsingHeader("Status", ImGuiTreeNodeFlags_DefaultOpen))
   1084   {
   1085     ImGui::Text("Data-Out FIFO Empty: %s", s_state.status.data_out_fifo_empty ? "Yes" : "No");
   1086     ImGui::Text("Data-In FIFO Full: %s", s_state.status.data_in_fifo_full ? "Yes" : "No");
   1087     ImGui::Text("Command Busy: %s", s_state.status.command_busy ? "Yes" : "No");
   1088     ImGui::Text("Data-In Request: %s", s_state.status.data_in_request ? "Yes" : "No");
   1089     ImGui::Text("Output Depth: %s", output_depths[static_cast<u8>(s_state.status.data_output_depth.GetValue())]);
   1090     ImGui::Text("Output Signed: %s", s_state.status.data_output_signed ? "Yes" : "No");
   1091     ImGui::Text("Output Bit 15: %u", ZeroExtend32(s_state.status.data_output_bit15.GetValue()));
   1092     ImGui::Text("Current Block: %u", ZeroExtend32(s_state.status.current_block.GetValue()));
   1093     ImGui::Text("Parameter Words Remaining: %d",
   1094                 static_cast<s32>(SignExtend32(s_state.status.parameter_words_remaining.GetValue())));
   1095   }
   1096 
   1097   ImGui::End();
   1098 }