duckstation

DuckStation, archived at the revision just before upstream changed it to a proprietary software project; this version is the libre one.
git clone https://git.neptards.moe/u3shit/duckstation.git
Log | Files | Refs | README | LICENSE

gpu_sw.cpp (28092B)


      1 // SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
      2 // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
      3 
      4 #include "gpu_sw.h"
      5 #include "system.h"
      6 
      7 #include "util/gpu_device.h"
      8 
      9 #include "common/align.h"
     10 #include "common/assert.h"
     11 #include "common/gsvector.h"
     12 #include "common/gsvector_formatter.h"
     13 #include "common/log.h"
     14 
     15 #include <algorithm>
     16 
     17 Log_SetChannel(GPU_SW);
     18 
     19 GPU_SW::GPU_SW() = default;
     20 
     21 GPU_SW::~GPU_SW()
     22 {
     23   g_gpu_device->RecycleTexture(std::move(m_upload_texture));
     24   m_backend.Shutdown();
     25 }
     26 
     27 const Threading::Thread* GPU_SW::GetSWThread() const
     28 {
     29   return m_backend.GetThread();
     30 }
     31 
     32 bool GPU_SW::IsHardwareRenderer() const
     33 {
     34   return false;
     35 }
     36 
     37 bool GPU_SW::Initialize()
     38 {
     39   if (!GPU::Initialize() || !m_backend.Initialize(false))
     40     return false;
     41 
     42   static constexpr const std::array formats_for_16bit = {GPUTexture::Format::RGB565, GPUTexture::Format::RGBA5551,
     43                                                          GPUTexture::Format::RGBA8, GPUTexture::Format::BGRA8};
     44   static constexpr const std::array formats_for_24bit = {GPUTexture::Format::RGBA8, GPUTexture::Format::BGRA8,
     45                                                          GPUTexture::Format::RGB565, GPUTexture::Format::RGBA5551};
     46   for (const GPUTexture::Format format : formats_for_16bit)
     47   {
     48     if (g_gpu_device->SupportsTextureFormat(format))
     49     {
     50       m_16bit_display_format = format;
     51       break;
     52     }
     53   }
     54   for (const GPUTexture::Format format : formats_for_24bit)
     55   {
     56     if (g_gpu_device->SupportsTextureFormat(format))
     57     {
     58       m_24bit_display_format = format;
     59       break;
     60     }
     61   }
     62 
     63   return true;
     64 }
     65 
     66 bool GPU_SW::DoState(StateWrapper& sw, GPUTexture** host_texture, bool update_display)
     67 {
     68   // need to ensure the worker thread is done
     69   m_backend.Sync(true);
     70 
     71   // ignore the host texture for software mode, since we want to save vram here
     72   return GPU::DoState(sw, nullptr, update_display);
     73 }
     74 
     75 void GPU_SW::Reset(bool clear_vram)
     76 {
     77   GPU::Reset(clear_vram);
     78 
     79   m_backend.Reset();
     80 }
     81 
     82 void GPU_SW::UpdateSettings(const Settings& old_settings)
     83 {
     84   GPU::UpdateSettings(old_settings);
     85   m_backend.UpdateSettings();
     86 }
     87 
     88 GPUTexture* GPU_SW::GetDisplayTexture(u32 width, u32 height, GPUTexture::Format format)
     89 {
     90   if (!m_upload_texture || m_upload_texture->GetWidth() != width || m_upload_texture->GetHeight() != height ||
     91       m_upload_texture->GetFormat() != format)
     92   {
     93     ClearDisplayTexture();
     94     g_gpu_device->RecycleTexture(std::move(m_upload_texture));
     95     m_upload_texture =
     96       g_gpu_device->FetchTexture(width, height, 1, 1, 1, GPUTexture::Type::DynamicTexture, format, nullptr, 0);
     97     if (!m_upload_texture) [[unlikely]]
     98       ERROR_LOG("Failed to create {}x{} {} texture", width, height, static_cast<u32>(format));
     99   }
    100 
    101   return m_upload_texture.get();
    102 }
    103 
    104 template<GPUTexture::Format out_format, typename out_type>
    105 static void CopyOutRow16(const u16* src_ptr, out_type* dst_ptr, u32 width);
    106 
    107 template<GPUTexture::Format out_format, typename out_type>
    108 static out_type VRAM16ToOutput(u16 value);
    109 
    110 template<>
    111 ALWAYS_INLINE u16 VRAM16ToOutput<GPUTexture::Format::RGBA5551, u16>(u16 value)
    112 {
    113   return (value & 0x3E0) | ((value >> 10) & 0x1F) | ((value & 0x1F) << 10);
    114 }
    115 
    116 template<>
    117 ALWAYS_INLINE u16 VRAM16ToOutput<GPUTexture::Format::RGB565, u16>(u16 value)
    118 {
    119   return ((value & 0x3E0) << 1) | ((value & 0x20) << 1) | ((value >> 10) & 0x1F) | ((value & 0x1F) << 11);
    120 }
    121 
    122 template<>
    123 ALWAYS_INLINE u32 VRAM16ToOutput<GPUTexture::Format::RGBA8, u32>(u16 value)
    124 {
    125   const u32 value32 = ZeroExtend32(value);
    126   const u32 r = (value32 & 31u) << 3;
    127   const u32 g = ((value32 >> 5) & 31u) << 3;
    128   const u32 b = ((value32 >> 10) & 31u) << 3;
    129   const u32 a = ((value >> 15) != 0) ? 255 : 0;
    130   return ZeroExtend32(r) | (ZeroExtend32(g) << 8) | (ZeroExtend32(b) << 16) | (ZeroExtend32(a) << 24);
    131 }
    132 
    133 template<>
    134 ALWAYS_INLINE u32 VRAM16ToOutput<GPUTexture::Format::BGRA8, u32>(u16 value)
    135 {
    136   const u32 value32 = ZeroExtend32(value);
    137   const u32 r = (value32 & 31u) << 3;
    138   const u32 g = ((value32 >> 5) & 31u) << 3;
    139   const u32 b = ((value32 >> 10) & 31u) << 3;
    140   return ZeroExtend32(b) | (ZeroExtend32(g) << 8) | (ZeroExtend32(r) << 16) | (0xFF000000u);
    141 }
    142 
    143 template<>
    144 ALWAYS_INLINE void CopyOutRow16<GPUTexture::Format::RGBA5551, u16>(const u16* src_ptr, u16* dst_ptr, u32 width)
    145 {
    146   u32 col = 0;
    147 
    148   const u32 aligned_width = Common::AlignDownPow2(width, 8);
    149   for (; col < aligned_width; col += 8)
    150   {
    151     constexpr GSVector4i single_mask = GSVector4i::cxpr16(0x1F);
    152     GSVector4i value = GSVector4i::load<false>(src_ptr);
    153     src_ptr += 8;
    154     GSVector4i a = value & GSVector4i::cxpr16(0x3E0);
    155     GSVector4i b = value.srl16<10>() & single_mask;
    156     GSVector4i c = (value & single_mask).sll16<10>();
    157     value = (a | b) | c;
    158     GSVector4i::store<false>(dst_ptr, value);
    159     dst_ptr += 8;
    160   }
    161 
    162   for (; col < width; col++)
    163     *(dst_ptr++) = VRAM16ToOutput<GPUTexture::Format::RGBA5551, u16>(*(src_ptr++));
    164 }
    165 
    166 template<>
    167 ALWAYS_INLINE void CopyOutRow16<GPUTexture::Format::RGB565, u16>(const u16* src_ptr, u16* dst_ptr, u32 width)
    168 {
    169   u32 col = 0;
    170 
    171   const u32 aligned_width = Common::AlignDownPow2(width, 8);
    172   for (; col < aligned_width; col += 8)
    173   {
    174     constexpr GSVector4i single_mask = GSVector4i::cxpr16(0x1F);
    175     GSVector4i value = GSVector4i::load<false>(src_ptr);
    176     src_ptr += 8;
    177     GSVector4i a = (value & GSVector4i::cxpr16(0x3E0)).sll16<1>(); // (value & 0x3E0) << 1
    178     GSVector4i b = (value & GSVector4i::cxpr16(0x20)).sll16<1>();  // (value & 0x20) << 1
    179     GSVector4i c = (value.srl16<10>() & single_mask);              // ((value >> 10) & 0x1F)
    180     GSVector4i d = (value & single_mask).sll16<11>();              // ((value & 0x1F) << 11)
    181     value = (((a | b) | c) | d);
    182     GSVector4i::store<false>(dst_ptr, value);
    183     dst_ptr += 8;
    184   }
    185 
    186   for (; col < width; col++)
    187     *(dst_ptr++) = VRAM16ToOutput<GPUTexture::Format::RGB565, u16>(*(src_ptr++));
    188 }
    189 
    190 template<>
    191 ALWAYS_INLINE void CopyOutRow16<GPUTexture::Format::RGBA8, u32>(const u16* src_ptr, u32* dst_ptr, u32 width)
    192 {
    193   for (u32 col = 0; col < width; col++)
    194     *(dst_ptr++) = VRAM16ToOutput<GPUTexture::Format::RGBA8, u32>(*(src_ptr++));
    195 }
    196 
    197 template<>
    198 ALWAYS_INLINE void CopyOutRow16<GPUTexture::Format::BGRA8, u32>(const u16* src_ptr, u32* dst_ptr, u32 width)
    199 {
    200   for (u32 col = 0; col < width; col++)
    201     *(dst_ptr++) = VRAM16ToOutput<GPUTexture::Format::BGRA8, u32>(*(src_ptr++));
    202 }
    203 
    204 template<GPUTexture::Format display_format>
    205 ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut15Bit(u32 src_x, u32 src_y, u32 width, u32 height, u32 line_skip)
    206 {
    207   using OutputPixelType =
    208     std::conditional_t<display_format == GPUTexture::Format::RGBA8 || display_format == GPUTexture::Format::BGRA8, u32,
    209                        u16>;
    210 
    211   GPUTexture* texture = GetDisplayTexture(width, height, display_format);
    212   if (!texture) [[unlikely]]
    213     return false;
    214 
    215   u32 dst_stride = width * sizeof(OutputPixelType);
    216   u8* dst_ptr = m_upload_buffer.data();
    217   const bool mapped = texture->Map(reinterpret_cast<void**>(&dst_ptr), &dst_stride, 0, 0, width, height);
    218 
    219   // Fast path when not wrapping around.
    220   if ((src_x + width) <= VRAM_WIDTH && (src_y + height) <= VRAM_HEIGHT)
    221   {
    222     const u16* src_ptr = &g_vram[src_y * VRAM_WIDTH + src_x];
    223     const u32 src_step = VRAM_WIDTH << line_skip;
    224     for (u32 row = 0; row < height; row++)
    225     {
    226       CopyOutRow16<display_format>(src_ptr, reinterpret_cast<OutputPixelType*>(dst_ptr), width);
    227       src_ptr += src_step;
    228       dst_ptr += dst_stride;
    229     }
    230   }
    231   else
    232   {
    233     const u32 end_x = src_x + width;
    234     const u32 y_step = (1 << line_skip);
    235     for (u32 row = 0; row < height; row++)
    236     {
    237       const u16* src_row_ptr = &g_vram[(src_y % VRAM_HEIGHT) * VRAM_WIDTH];
    238       OutputPixelType* dst_row_ptr = reinterpret_cast<OutputPixelType*>(dst_ptr);
    239 
    240       for (u32 col = src_x; col < end_x; col++)
    241         *(dst_row_ptr++) = VRAM16ToOutput<display_format, OutputPixelType>(src_row_ptr[col % VRAM_WIDTH]);
    242 
    243       src_y += y_step;
    244       dst_ptr += dst_stride;
    245     }
    246   }
    247 
    248   if (mapped)
    249     texture->Unmap();
    250   else
    251     texture->Update(0, 0, width, height, m_upload_buffer.data(), dst_stride);
    252 
    253   return true;
    254 }
    255 
    256 template<GPUTexture::Format display_format>
    257 ALWAYS_INLINE_RELEASE bool GPU_SW::CopyOut24Bit(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 height, u32 line_skip)
    258 {
    259   using OutputPixelType =
    260     std::conditional_t<display_format == GPUTexture::Format::RGBA8 || display_format == GPUTexture::Format::BGRA8, u32,
    261                        u16>;
    262 
    263   GPUTexture* texture = GetDisplayTexture(width, height, display_format);
    264   if (!texture) [[unlikely]]
    265     return false;
    266 
    267   u32 dst_stride = Common::AlignUpPow2<u32>(width * sizeof(OutputPixelType), 4);
    268   u8* dst_ptr = m_upload_buffer.data();
    269   const bool mapped = texture->Map(reinterpret_cast<void**>(&dst_ptr), &dst_stride, 0, 0, width, height);
    270 
    271   if ((src_x + width) <= VRAM_WIDTH && (src_y + (height << line_skip)) <= VRAM_HEIGHT)
    272   {
    273     const u8* src_ptr = reinterpret_cast<const u8*>(&g_vram[src_y * VRAM_WIDTH + src_x]) + (skip_x * 3);
    274     const u32 src_stride = (VRAM_WIDTH << line_skip) * sizeof(u16);
    275     for (u32 row = 0; row < height; row++)
    276     {
    277       if constexpr (display_format == GPUTexture::Format::RGBA8)
    278       {
    279         const u8* src_row_ptr = src_ptr;
    280         u8* dst_row_ptr = reinterpret_cast<u8*>(dst_ptr);
    281         for (u32 col = 0; col < width; col++)
    282         {
    283           *(dst_row_ptr++) = *(src_row_ptr++);
    284           *(dst_row_ptr++) = *(src_row_ptr++);
    285           *(dst_row_ptr++) = *(src_row_ptr++);
    286           *(dst_row_ptr++) = 0xFF;
    287         }
    288       }
    289       else if constexpr (display_format == GPUTexture::Format::BGRA8)
    290       {
    291         const u8* src_row_ptr = src_ptr;
    292         u8* dst_row_ptr = reinterpret_cast<u8*>(dst_ptr);
    293         for (u32 col = 0; col < width; col++)
    294         {
    295           *(dst_row_ptr++) = src_row_ptr[2];
    296           *(dst_row_ptr++) = src_row_ptr[1];
    297           *(dst_row_ptr++) = src_row_ptr[0];
    298           *(dst_row_ptr++) = 0xFF;
    299           src_row_ptr += 3;
    300         }
    301       }
    302       else if constexpr (display_format == GPUTexture::Format::RGB565)
    303       {
    304         const u8* src_row_ptr = src_ptr;
    305         u16* dst_row_ptr = reinterpret_cast<u16*>(dst_ptr);
    306         for (u32 col = 0; col < width; col++)
    307         {
    308           *(dst_row_ptr++) = ((static_cast<u16>(src_row_ptr[0]) >> 3) << 11) |
    309                              ((static_cast<u16>(src_row_ptr[1]) >> 2) << 5) | (static_cast<u16>(src_row_ptr[2]) >> 3);
    310           src_row_ptr += 3;
    311         }
    312       }
    313       else if constexpr (display_format == GPUTexture::Format::RGBA5551)
    314       {
    315         const u8* src_row_ptr = src_ptr;
    316         u16* dst_row_ptr = reinterpret_cast<u16*>(dst_ptr);
    317         for (u32 col = 0; col < width; col++)
    318         {
    319           *(dst_row_ptr++) = ((static_cast<u16>(src_row_ptr[0]) >> 3) << 10) |
    320                              ((static_cast<u16>(src_row_ptr[1]) >> 3) << 5) | (static_cast<u16>(src_row_ptr[2]) >> 3);
    321           src_row_ptr += 3;
    322         }
    323       }
    324 
    325       src_ptr += src_stride;
    326       dst_ptr += dst_stride;
    327     }
    328   }
    329   else
    330   {
    331     const u32 y_step = (1 << line_skip);
    332 
    333     for (u32 row = 0; row < height; row++)
    334     {
    335       const u16* src_row_ptr = &g_vram[(src_y % VRAM_HEIGHT) * VRAM_WIDTH];
    336       OutputPixelType* dst_row_ptr = reinterpret_cast<OutputPixelType*>(dst_ptr);
    337 
    338       for (u32 col = 0; col < width; col++)
    339       {
    340         const u32 offset = (src_x + (((skip_x + col) * 3) / 2));
    341         const u16 s0 = src_row_ptr[offset % VRAM_WIDTH];
    342         const u16 s1 = src_row_ptr[(offset + 1) % VRAM_WIDTH];
    343         const u8 shift = static_cast<u8>(col & 1u) * 8;
    344         const u32 rgb = (((ZeroExtend32(s1) << 16) | ZeroExtend32(s0)) >> shift);
    345 
    346         if constexpr (display_format == GPUTexture::Format::RGBA8)
    347         {
    348           *(dst_row_ptr++) = rgb | 0xFF000000u;
    349         }
    350         else if constexpr (display_format == GPUTexture::Format::BGRA8)
    351         {
    352           *(dst_row_ptr++) = (rgb & 0x00FF00) | ((rgb & 0xFF) << 16) | ((rgb >> 16) & 0xFF) | 0xFF000000u;
    353         }
    354         else if constexpr (display_format == GPUTexture::Format::RGB565)
    355         {
    356           *(dst_row_ptr++) = ((rgb >> 3) & 0x1F) | (((rgb >> 10) << 5) & 0x7E0) | (((rgb >> 19) << 11) & 0x3E0000);
    357         }
    358         else if constexpr (display_format == GPUTexture::Format::RGBA5551)
    359         {
    360           *(dst_row_ptr++) = ((rgb >> 3) & 0x1F) | (((rgb >> 11) << 5) & 0x3E0) | (((rgb >> 19) << 10) & 0x1F0000);
    361         }
    362       }
    363 
    364       src_y += y_step;
    365       dst_ptr += dst_stride;
    366     }
    367   }
    368 
    369   if (mapped)
    370     texture->Unmap();
    371   else
    372     texture->Update(0, 0, width, height, m_upload_buffer.data(), dst_stride);
    373 
    374   return true;
    375 }
    376 
    377 bool GPU_SW::CopyOut(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 height, u32 line_skip, bool is_24bit)
    378 {
    379   if (!is_24bit)
    380   {
    381     DebugAssert(skip_x == 0);
    382 
    383     switch (m_16bit_display_format)
    384     {
    385       case GPUTexture::Format::RGBA5551:
    386         return CopyOut15Bit<GPUTexture::Format::RGBA5551>(src_x, src_y, width, height, line_skip);
    387 
    388       case GPUTexture::Format::RGB565:
    389         return CopyOut15Bit<GPUTexture::Format::RGB565>(src_x, src_y, width, height, line_skip);
    390 
    391       case GPUTexture::Format::RGBA8:
    392         return CopyOut15Bit<GPUTexture::Format::RGBA8>(src_x, src_y, width, height, line_skip);
    393 
    394       case GPUTexture::Format::BGRA8:
    395         return CopyOut15Bit<GPUTexture::Format::BGRA8>(src_x, src_y, width, height, line_skip);
    396 
    397       default:
    398         UnreachableCode();
    399     }
    400   }
    401   else
    402   {
    403     switch (m_24bit_display_format)
    404     {
    405       case GPUTexture::Format::RGBA5551:
    406         return CopyOut24Bit<GPUTexture::Format::RGBA5551>(src_x, src_y, skip_x, width, height, line_skip);
    407 
    408       case GPUTexture::Format::RGB565:
    409         return CopyOut24Bit<GPUTexture::Format::RGB565>(src_x, src_y, skip_x, width, height, line_skip);
    410 
    411       case GPUTexture::Format::RGBA8:
    412         return CopyOut24Bit<GPUTexture::Format::RGBA8>(src_x, src_y, skip_x, width, height, line_skip);
    413 
    414       case GPUTexture::Format::BGRA8:
    415         return CopyOut24Bit<GPUTexture::Format::BGRA8>(src_x, src_y, skip_x, width, height, line_skip);
    416 
    417       default:
    418         UnreachableCode();
    419     }
    420   }
    421 }
    422 
    423 void GPU_SW::UpdateDisplay()
    424 {
    425   // fill display texture
    426   m_backend.Sync(true);
    427 
    428   if (!g_settings.debugging.show_vram)
    429   {
    430     if (IsDisplayDisabled())
    431     {
    432       ClearDisplayTexture();
    433       return;
    434     }
    435 
    436     const bool is_24bit = m_GPUSTAT.display_area_color_depth_24;
    437     const bool interlaced = IsInterlacedDisplayEnabled();
    438     const u32 field = GetInterlacedDisplayField();
    439     const u32 vram_offset_x = is_24bit ? m_crtc_state.regs.X : m_crtc_state.display_vram_left;
    440     const u32 vram_offset_y =
    441       m_crtc_state.display_vram_top + ((interlaced && m_GPUSTAT.vertical_resolution) ? field : 0);
    442     const u32 skip_x = is_24bit ? (m_crtc_state.display_vram_left - m_crtc_state.regs.X) : 0;
    443     const u32 read_width = m_crtc_state.display_vram_width;
    444     const u32 read_height = interlaced ? (m_crtc_state.display_vram_height / 2) : m_crtc_state.display_vram_height;
    445 
    446     if (IsInterlacedDisplayEnabled())
    447     {
    448       const u32 line_skip = m_GPUSTAT.vertical_resolution;
    449       if (CopyOut(vram_offset_x, vram_offset_y, skip_x, read_width, read_height, line_skip, is_24bit))
    450       {
    451         SetDisplayTexture(m_upload_texture.get(), nullptr, 0, 0, read_width, read_height);
    452         if (is_24bit && g_settings.display_24bit_chroma_smoothing)
    453         {
    454           if (ApplyChromaSmoothing())
    455             Deinterlace(field, 0);
    456         }
    457         else
    458         {
    459           Deinterlace(field, 0);
    460         }
    461       }
    462     }
    463     else
    464     {
    465       if (CopyOut(vram_offset_x, vram_offset_y, skip_x, read_width, read_height, 0, is_24bit))
    466       {
    467         SetDisplayTexture(m_upload_texture.get(), nullptr, 0, 0, read_width, read_height);
    468         if (is_24bit && g_settings.display_24bit_chroma_smoothing)
    469           ApplyChromaSmoothing();
    470       }
    471     }
    472   }
    473   else
    474   {
    475     if (CopyOut(0, 0, 0, VRAM_WIDTH, VRAM_HEIGHT, 0, false))
    476       SetDisplayTexture(m_upload_texture.get(), nullptr, 0, 0, VRAM_WIDTH, VRAM_HEIGHT);
    477   }
    478 }
    479 
    480 void GPU_SW::FillBackendCommandParameters(GPUBackendCommand* cmd) const
    481 {
    482   cmd->params.bits = 0;
    483   cmd->params.check_mask_before_draw = m_GPUSTAT.check_mask_before_draw;
    484   cmd->params.set_mask_while_drawing = m_GPUSTAT.set_mask_while_drawing;
    485   cmd->params.active_line_lsb = m_crtc_state.active_line_lsb;
    486   cmd->params.interlaced_rendering = IsInterlacedRenderingEnabled();
    487 }
    488 
    489 void GPU_SW::FillDrawCommand(GPUBackendDrawCommand* cmd, GPURenderCommand rc) const
    490 {
    491   FillBackendCommandParameters(cmd);
    492   cmd->rc.bits = rc.bits;
    493   cmd->draw_mode.bits = m_draw_mode.mode_reg.bits;
    494   cmd->palette.bits = m_draw_mode.palette_reg.bits;
    495   cmd->window = m_draw_mode.texture_window;
    496 }
    497 
    498 void GPU_SW::DispatchRenderCommand()
    499 {
    500   if (m_drawing_area_changed)
    501   {
    502     GPUBackendSetDrawingAreaCommand* cmd = m_backend.NewSetDrawingAreaCommand();
    503     cmd->new_area = m_drawing_area;
    504     m_backend.PushCommand(cmd);
    505     m_drawing_area_changed = false;
    506   }
    507 
    508   const GPURenderCommand rc{m_render_command.bits};
    509 
    510   switch (rc.primitive)
    511   {
    512     case GPUPrimitive::Polygon:
    513     {
    514       const u32 num_vertices = rc.quad_polygon ? 4 : 3;
    515       GPUBackendDrawPolygonCommand* cmd = m_backend.NewDrawPolygonCommand(num_vertices);
    516       FillDrawCommand(cmd, rc);
    517 
    518       std::array<GSVector2i, 4> positions;
    519       const u32 first_color = rc.color_for_first_vertex;
    520       const bool shaded = rc.shading_enable;
    521       const bool textured = rc.texture_enable;
    522       for (u32 i = 0; i < num_vertices; i++)
    523       {
    524         GPUBackendDrawPolygonCommand::Vertex* vert = &cmd->vertices[i];
    525         vert->color = (shaded && i > 0) ? (FifoPop() & UINT32_C(0x00FFFFFF)) : first_color;
    526         const u64 maddr_and_pos = m_fifo.Pop();
    527         const GPUVertexPosition vp{Truncate32(maddr_and_pos)};
    528         vert->x = m_drawing_offset.x + vp.x;
    529         vert->y = m_drawing_offset.y + vp.y;
    530         vert->texcoord = textured ? Truncate16(FifoPop()) : 0;
    531         positions[i] = GSVector2i::load(&vert->x);
    532       }
    533 
    534       // Cull polygons which are too large.
    535       const GSVector2i min_pos_12 = positions[1].min_i32(positions[2]);
    536       const GSVector2i max_pos_12 = positions[1].max_i32(positions[2]);
    537       const GSVector4i draw_rect_012 = GSVector4i(min_pos_12.min_i32(positions[0]))
    538                                          .upl64(GSVector4i(max_pos_12.max_i32(positions[0])))
    539                                          .add32(GSVector4i::cxpr(0, 0, 1, 1));
    540       const bool first_tri_culled =
    541         (draw_rect_012.width() > MAX_PRIMITIVE_WIDTH || draw_rect_012.height() > MAX_PRIMITIVE_HEIGHT ||
    542          !m_clamped_drawing_area.rintersects(draw_rect_012));
    543       if (first_tri_culled)
    544       {
    545         DEBUG_LOG("Culling off-screen/too-large polygon: {},{} {},{} {},{}", cmd->vertices[0].x, cmd->vertices[0].y,
    546                   cmd->vertices[1].x, cmd->vertices[1].y, cmd->vertices[2].x, cmd->vertices[2].y);
    547 
    548         if (!rc.quad_polygon)
    549           return;
    550       }
    551       else
    552       {
    553         AddDrawTriangleTicks(positions[0], positions[1], positions[2], rc.shading_enable, rc.texture_enable,
    554                              rc.transparency_enable);
    555       }
    556 
    557       // quads
    558       if (rc.quad_polygon)
    559       {
    560         const GSVector4i draw_rect_123 = GSVector4i(min_pos_12.min_i32(positions[3]))
    561                                            .upl64(GSVector4i(max_pos_12.max_i32(positions[3])))
    562                                            .add32(GSVector4i::cxpr(0, 0, 1, 1));
    563 
    564         // Cull polygons which are too large.
    565         const bool second_tri_culled =
    566           (draw_rect_123.width() > MAX_PRIMITIVE_WIDTH || draw_rect_123.height() > MAX_PRIMITIVE_HEIGHT ||
    567            !m_clamped_drawing_area.rintersects(draw_rect_123));
    568         if (second_tri_culled)
    569         {
    570           DEBUG_LOG("Culling too-large polygon (quad second half): {},{} {},{} {},{}", cmd->vertices[2].x,
    571                     cmd->vertices[2].y, cmd->vertices[1].x, cmd->vertices[1].y, cmd->vertices[0].x, cmd->vertices[0].y);
    572 
    573           if (first_tri_culled)
    574             return;
    575         }
    576         else
    577         {
    578           AddDrawTriangleTicks(positions[2], positions[1], positions[3], rc.shading_enable, rc.texture_enable,
    579                                rc.transparency_enable);
    580         }
    581       }
    582 
    583       m_backend.PushCommand(cmd);
    584     }
    585     break;
    586 
    587     case GPUPrimitive::Rectangle:
    588     {
    589       GPUBackendDrawRectangleCommand* cmd = m_backend.NewDrawRectangleCommand();
    590       FillDrawCommand(cmd, rc);
    591       cmd->color = rc.color_for_first_vertex;
    592 
    593       const GPUVertexPosition vp{FifoPop()};
    594       cmd->x = TruncateGPUVertexPosition(m_drawing_offset.x + vp.x);
    595       cmd->y = TruncateGPUVertexPosition(m_drawing_offset.y + vp.y);
    596 
    597       if (rc.texture_enable)
    598       {
    599         const u32 texcoord_and_palette = FifoPop();
    600         cmd->palette.bits = Truncate16(texcoord_and_palette >> 16);
    601         cmd->texcoord = Truncate16(texcoord_and_palette);
    602       }
    603       else
    604       {
    605         cmd->palette.bits = 0;
    606         cmd->texcoord = 0;
    607       }
    608 
    609       switch (rc.rectangle_size)
    610       {
    611         case GPUDrawRectangleSize::R1x1:
    612           cmd->width = 1;
    613           cmd->height = 1;
    614           break;
    615         case GPUDrawRectangleSize::R8x8:
    616           cmd->width = 8;
    617           cmd->height = 8;
    618           break;
    619         case GPUDrawRectangleSize::R16x16:
    620           cmd->width = 16;
    621           cmd->height = 16;
    622           break;
    623         default:
    624         {
    625           const u32 width_and_height = FifoPop();
    626           cmd->width = static_cast<u16>(width_and_height & VRAM_WIDTH_MASK);
    627           cmd->height = static_cast<u16>((width_and_height >> 16) & VRAM_HEIGHT_MASK);
    628         }
    629         break;
    630       }
    631 
    632       const GSVector4i rect = GSVector4i(cmd->x, cmd->y, cmd->x + cmd->width, cmd->y + cmd->height);
    633       const GSVector4i clamped_rect = m_clamped_drawing_area.rintersect(rect);
    634       if (clamped_rect.rempty()) [[unlikely]]
    635       {
    636         DEBUG_LOG("Culling off-screen rectangle {}", rect);
    637         return;
    638       }
    639 
    640       AddDrawRectangleTicks(clamped_rect, rc.texture_enable, rc.transparency_enable);
    641 
    642       m_backend.PushCommand(cmd);
    643     }
    644     break;
    645 
    646     case GPUPrimitive::Line:
    647     {
    648       if (!rc.polyline)
    649       {
    650         GPUBackendDrawLineCommand* cmd = m_backend.NewDrawLineCommand(2);
    651         FillDrawCommand(cmd, rc);
    652         cmd->palette.bits = 0;
    653 
    654         if (rc.shading_enable)
    655         {
    656           cmd->vertices[0].color = rc.color_for_first_vertex;
    657           const GPUVertexPosition start_pos{FifoPop()};
    658           cmd->vertices[0].x = m_drawing_offset.x + start_pos.x;
    659           cmd->vertices[0].y = m_drawing_offset.y + start_pos.y;
    660 
    661           cmd->vertices[1].color = FifoPop() & UINT32_C(0x00FFFFFF);
    662           const GPUVertexPosition end_pos{FifoPop()};
    663           cmd->vertices[1].x = m_drawing_offset.x + end_pos.x;
    664           cmd->vertices[1].y = m_drawing_offset.y + end_pos.y;
    665         }
    666         else
    667         {
    668           cmd->vertices[0].color = rc.color_for_first_vertex;
    669           cmd->vertices[1].color = rc.color_for_first_vertex;
    670 
    671           const GPUVertexPosition start_pos{FifoPop()};
    672           cmd->vertices[0].x = m_drawing_offset.x + start_pos.x;
    673           cmd->vertices[0].y = m_drawing_offset.y + start_pos.y;
    674 
    675           const GPUVertexPosition end_pos{FifoPop()};
    676           cmd->vertices[1].x = m_drawing_offset.x + end_pos.x;
    677           cmd->vertices[1].y = m_drawing_offset.y + end_pos.y;
    678         }
    679 
    680         const GSVector4i v0 = GSVector4i::loadl(&cmd->vertices[0].x);
    681         const GSVector4i v1 = GSVector4i::loadl(&cmd->vertices[1].x);
    682         const GSVector4i rect = v0.min_i32(v1).xyxy(v0.max_i32(v1)).add32(GSVector4i::cxpr(0, 0, 1, 1));
    683         const GSVector4i clamped_rect = rect.rintersect(m_clamped_drawing_area);
    684 
    685         if (rect.width() > MAX_PRIMITIVE_WIDTH || rect.height() > MAX_PRIMITIVE_HEIGHT || clamped_rect.rempty())
    686         {
    687           DEBUG_LOG("Culling too-large/off-screen line: {},{} - {},{}", cmd->vertices[0].y, cmd->vertices[0].y,
    688                     cmd->vertices[1].x, cmd->vertices[1].y);
    689           return;
    690         }
    691 
    692         AddDrawLineTicks(clamped_rect, rc.shading_enable);
    693 
    694         m_backend.PushCommand(cmd);
    695       }
    696       else
    697       {
    698         const u32 num_vertices = GetPolyLineVertexCount();
    699 
    700         GPUBackendDrawLineCommand* cmd = m_backend.NewDrawLineCommand(num_vertices);
    701         FillDrawCommand(cmd, m_render_command);
    702 
    703         u32 buffer_pos = 0;
    704         const GPUVertexPosition start_vp{m_blit_buffer[buffer_pos++]};
    705         cmd->vertices[0].x = start_vp.x + m_drawing_offset.x;
    706         cmd->vertices[0].y = start_vp.y + m_drawing_offset.y;
    707         cmd->vertices[0].color = m_render_command.color_for_first_vertex;
    708 
    709         const bool shaded = m_render_command.shading_enable;
    710         for (u32 i = 1; i < num_vertices; i++)
    711         {
    712           cmd->vertices[i].color =
    713             shaded ? (m_blit_buffer[buffer_pos++] & UINT32_C(0x00FFFFFF)) : m_render_command.color_for_first_vertex;
    714           const GPUVertexPosition vp{m_blit_buffer[buffer_pos++]};
    715           cmd->vertices[i].x = m_drawing_offset.x + vp.x;
    716           cmd->vertices[i].y = m_drawing_offset.y + vp.y;
    717 
    718           const GSVector4i v0 = GSVector4i::loadl(&cmd->vertices[0].x);
    719           const GSVector4i v1 = GSVector4i::loadl(&cmd->vertices[1].x);
    720           const GSVector4i rect = v0.min_i32(v1).xyxy(v0.max_i32(v1)).add32(GSVector4i::cxpr(0, 0, 1, 1));
    721           const GSVector4i clamped_rect = rect.rintersect(m_clamped_drawing_area);
    722 
    723           if (rect.width() > MAX_PRIMITIVE_WIDTH || rect.height() > MAX_PRIMITIVE_HEIGHT || clamped_rect.rempty())
    724           {
    725             DEBUG_LOG("Culling too-large/off-screen line: {},{} - {},{}", cmd->vertices[i - 1].x,
    726                       cmd->vertices[i - 1].y, cmd->vertices[i].x, cmd->vertices[i].y);
    727             return;
    728           }
    729           else
    730           {
    731             AddDrawLineTicks(clamped_rect, rc.shading_enable);
    732           }
    733         }
    734 
    735         m_backend.PushCommand(cmd);
    736       }
    737     }
    738     break;
    739 
    740     default:
    741       UnreachableCode();
    742       break;
    743   }
    744 }
    745 
    746 void GPU_SW::ReadVRAM(u32 x, u32 y, u32 width, u32 height)
    747 {
    748   m_backend.Sync(false);
    749 }
    750 
    751 void GPU_SW::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color)
    752 {
    753   GPUBackendFillVRAMCommand* cmd = m_backend.NewFillVRAMCommand();
    754   FillBackendCommandParameters(cmd);
    755   cmd->x = static_cast<u16>(x);
    756   cmd->y = static_cast<u16>(y);
    757   cmd->width = static_cast<u16>(width);
    758   cmd->height = static_cast<u16>(height);
    759   cmd->color = color;
    760   m_backend.PushCommand(cmd);
    761 }
    762 
    763 void GPU_SW::UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, bool set_mask, bool check_mask)
    764 {
    765   const u32 num_words = width * height;
    766   GPUBackendUpdateVRAMCommand* cmd = m_backend.NewUpdateVRAMCommand(num_words);
    767   FillBackendCommandParameters(cmd);
    768   cmd->params.set_mask_while_drawing = set_mask;
    769   cmd->params.check_mask_before_draw = check_mask;
    770   cmd->x = static_cast<u16>(x);
    771   cmd->y = static_cast<u16>(y);
    772   cmd->width = static_cast<u16>(width);
    773   cmd->height = static_cast<u16>(height);
    774   std::memcpy(cmd->data, data, sizeof(u16) * num_words);
    775   m_backend.PushCommand(cmd);
    776 }
    777 
    778 void GPU_SW::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height)
    779 {
    780   GPUBackendCopyVRAMCommand* cmd = m_backend.NewCopyVRAMCommand();
    781   FillBackendCommandParameters(cmd);
    782   cmd->src_x = static_cast<u16>(src_x);
    783   cmd->src_y = static_cast<u16>(src_y);
    784   cmd->dst_x = static_cast<u16>(dst_x);
    785   cmd->dst_y = static_cast<u16>(dst_y);
    786   cmd->width = static_cast<u16>(width);
    787   cmd->height = static_cast<u16>(height);
    788   m_backend.PushCommand(cmd);
    789 }
    790 
    791 void GPU_SW::FlushRender()
    792 {
    793 }
    794 
    795 void GPU_SW::UpdateCLUT(GPUTexturePaletteReg reg, bool clut_is_8bit)
    796 {
    797   GPUBackendUpdateCLUTCommand* cmd = m_backend.NewUpdateCLUTCommand();
    798   FillBackendCommandParameters(cmd);
    799   cmd->reg.bits = reg.bits;
    800   cmd->clut_is_8bit = clut_is_8bit;
    801   m_backend.PushCommand(cmd);
    802 }
    803 
    804 std::unique_ptr<GPU> GPU::CreateSoftwareRenderer()
    805 {
    806   std::unique_ptr<GPU_SW> gpu(std::make_unique<GPU_SW>());
    807   if (!gpu->Initialize())
    808     return nullptr;
    809 
    810   return gpu;
    811 }