duckstation

DuckStation, archived from the last revision before upstream relicensed it as proprietary software; this version is the libre one.
git clone https://git.neptards.moe/u3shit/duckstation.git
Log | Files | Refs | README | LICENSE

gpu_hw.cpp (162574B)


      1 // SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
      2 // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
      3 
      4 #include "gpu_hw.h"
      5 #include "cpu_core.h"
      6 #include "cpu_pgxp.h"
      7 #include "gpu_hw_shadergen.h"
      8 #include "gpu_sw_backend.h"
      9 #include "host.h"
     10 #include "settings.h"
     11 #include "system.h"
     12 
     13 #include "util/imgui_manager.h"
     14 #include "util/postprocessing.h"
     15 #include "util/state_wrapper.h"
     16 
     17 #include "common/align.h"
     18 #include "common/assert.h"
     19 #include "common/error.h"
     20 #include "common/gsvector_formatter.h"
     21 #include "common/log.h"
     22 #include "common/scoped_guard.h"
     23 #include "common/string_util.h"
     24 #include "common/timer.h"
     25 
     26 #include "IconsFontAwesome5.h"
     27 #include "IconsEmoji.h"
     28 #include "imgui.h"
     29 
     30 #include <cmath>
     31 #include <limits>
     32 #include <sstream>
     33 #include <tuple>
     34 
     35 Log_SetChannel(GPU_HW);
     36 
     37 // TODO: instead of full state restore, only restore what changed
     38 
// Texture formats used for the hardware VRAM render target and its depth-buffer variants.
static constexpr GPUTexture::Format VRAM_RT_FORMAT = GPUTexture::Format::RGBA8;
static constexpr GPUTexture::Format VRAM_DS_FORMAT = GPUTexture::Format::D16;
static constexpr GPUTexture::Format VRAM_DS_DEPTH_FORMAT = GPUTexture::Format::D32F;
static constexpr GPUTexture::Format VRAM_DS_COLOR_FORMAT = GPUTexture::Format::R32F;

#ifdef _DEBUG

// Debug-only monotonically increasing draw counter, reset in the GPU_HW constructor.
static u32 s_draw_number = 0;

// Human-readable names for debug output. NOTE(review): assumed to match the order of the
// corresponding enums (transparency/texture/render mode) declared elsewhere — confirm
// against the headers before reordering.
static constexpr const std::array s_transparency_modes = {
  "HalfBackgroundPlusHalfForeground",
  "BackgroundPlusForeground",
  "BackgroundMinusForeground",
  "BackgroundPlusQuarterForeground",
  "Disabled",
};

static constexpr const std::array s_batch_texture_modes = {
  "Palette4Bit",       "Palette8Bit",       "Direct16Bit",       "Disabled",
  "SpritePalette4Bit", "SpritePalette8Bit", "SpriteDirect16Bit",
};

static constexpr const std::array s_batch_render_modes = {
  "TransparencyDisabled", "TransparentAndOpaque", "OnlyOpaque", "OnlyTransparent", "ShaderBlend",
};

#endif
     66 
     67 /// Returns the distance between two rectangles.
     68 ALWAYS_INLINE static float RectDistance(const GSVector4i lhs, const GSVector4i rhs)
     69 {
     70   const s32 lcx = (lhs.left + ((lhs.right - lhs.left) / 2));
     71   const s32 lcy = (lhs.top + ((lhs.bottom - lhs.top) / 2));
     72   const s32 rcx = (rhs.left + ((rhs.right - rhs.left) / 2));
     73   const s32 rcy = (rhs.top + ((rhs.bottom - rhs.top) / 2));
     74   const s32 dx = (lcx - rcx);
     75   const s32 dy = (lcy - rcy);
     76   const s32 distsq = (dx * dx) + (dy * dy);
     77   return std::sqrt(static_cast<float>(distsq));
     78 }
     79 
     80 ALWAYS_INLINE static u32 GetMaxResolutionScale()
     81 {
     82   return g_gpu_device->GetMaxTextureSize() / VRAM_WIDTH;
     83 }
     84 
     85 ALWAYS_INLINE_RELEASE static u32 GetBoxDownsampleScale(u32 resolution_scale)
     86 {
     87   u32 scale = std::min<u32>(resolution_scale, g_settings.gpu_downsample_scale);
     88   while ((resolution_scale % scale) != 0)
     89     scale--;
     90   return scale;
     91 }
     92 
     93 ALWAYS_INLINE static bool ShouldClampUVs(GPUTextureFilter texture_filter)
     94 {
     95   // We only need UV limits if PGXP is enabled, or texture filtering is enabled.
     96   return g_settings.gpu_pgxp_enable || texture_filter != GPUTextureFilter::Nearest;
     97 }
     98 
     99 ALWAYS_INLINE static bool ShouldAllowSpriteMode(u8 resolution_scale, GPUTextureFilter texture_filter,
    100                                                 GPUTextureFilter sprite_texture_filter)
    101 {
    102   // Use sprite shaders/mode when texcoord rounding is forced, or if the filters are different.
    103   return (sprite_texture_filter != texture_filter || (resolution_scale > 1 && g_settings.gpu_force_round_texcoords));
    104 }
    105 
    106 ALWAYS_INLINE static bool ShouldDisableColorPerspective()
    107 {
    108   return g_settings.gpu_pgxp_enable && g_settings.gpu_pgxp_texture_correction && !g_settings.gpu_pgxp_color_correction;
    109 }
    110 
    111 /// Returns true if the specified texture filtering mode requires dual-source blending.
    112 ALWAYS_INLINE static bool IsBlendedTextureFiltering(GPUTextureFilter filter)
    113 {
    114   // return (filter == GPUTextureFilter::Bilinear || filter == GPUTextureFilter::JINC2 || filter ==
    115   // GPUTextureFilter::xBR);
    116   static_assert(((static_cast<u8>(GPUTextureFilter::Nearest) & 1u) == 0u) &&
    117                 ((static_cast<u8>(GPUTextureFilter::Bilinear) & 1u) == 1u) &&
    118                 ((static_cast<u8>(GPUTextureFilter::BilinearBinAlpha) & 1u) == 0u) &&
    119                 ((static_cast<u8>(GPUTextureFilter::JINC2) & 1u) == 1u) &&
    120                 ((static_cast<u8>(GPUTextureFilter::JINC2BinAlpha) & 1u) == 0u) &&
    121                 ((static_cast<u8>(GPUTextureFilter::xBR) & 1u) == 1u) &&
    122                 ((static_cast<u8>(GPUTextureFilter::xBRBinAlpha) & 1u) == 0u));
    123   return ((static_cast<u8>(filter) & 1u) == 1u);
    124 }
    125 
    126 /// Computes the area affected by a VRAM transfer, including wrap-around of X.
    127 ALWAYS_INLINE_RELEASE static GSVector4i GetVRAMTransferBounds(u32 x, u32 y, u32 width, u32 height)
    128 {
    129   GSVector4i ret;
    130   ret.left = x % VRAM_WIDTH;
    131   ret.top = y % VRAM_HEIGHT;
    132   ret.right = ret.left + width;
    133   ret.bottom = ret.top + height;
    134   if (ret.right > static_cast<s32>(VRAM_WIDTH))
    135   {
    136     ret.left = 0;
    137     ret.right = static_cast<s32>(VRAM_WIDTH);
    138   }
    139   if (ret.bottom > static_cast<s32>(VRAM_HEIGHT))
    140   {
    141     ret.top = 0;
    142     ret.bottom = static_cast<s32>(VRAM_HEIGHT);
    143   }
    144   return ret;
    145 }
    146 
    147 namespace {
    148 class ShaderCompileProgressTracker
    149 {
    150 public:
    151   ShaderCompileProgressTracker(std::string title, u32 total)
    152     : m_title(std::move(title)), m_min_time(Common::Timer::ConvertSecondsToValue(1.0)),
    153       m_update_interval(Common::Timer::ConvertSecondsToValue(0.1)), m_start_time(Common::Timer::GetCurrentValue()),
    154       m_last_update_time(0), m_progress(0), m_total(total)
    155   {
    156   }
    157   ~ShaderCompileProgressTracker() = default;
    158 
    159   void Increment(u32 progress = 1)
    160   {
    161     m_progress += progress;
    162 
    163     const u64 tv = Common::Timer::GetCurrentValue();
    164     if ((tv - m_start_time) >= m_min_time && (tv - m_last_update_time) >= m_update_interval)
    165     {
    166       Host::DisplayLoadingScreen(m_title.c_str(), 0, static_cast<int>(m_total), static_cast<int>(m_progress));
    167       m_last_update_time = tv;
    168     }
    169   }
    170 
    171 private:
    172   std::string m_title;
    173   u64 m_min_time;
    174   u64 m_update_interval;
    175   u64 m_start_time;
    176   u64 m_last_update_time;
    177   u32 m_progress;
    178   u32 m_total;
    179 };
    180 } // namespace
    181 
GPU_HW::GPU_HW() : GPU()
{
#ifdef _DEBUG
  // Restart the debug-only draw counter for this renderer instance.
  s_draw_number = 0;
#endif
}
    188 
GPU_HW::~GPU_HW()
{
  // Shut down the software-renderer backend (and its worker thread, if any) before the
  // rest of this object is torn down.
  if (m_sw_renderer)
  {
    m_sw_renderer->Shutdown();
    m_sw_renderer.reset();
  }
}
    197 
// Convenience overload: splits a packed 8:8 texcoord (U in the low byte, V in the high
// byte) and forwards to the full Set().
ALWAYS_INLINE void GPU_HW::BatchVertex::Set(float x_, float y_, float z_, float w_, u32 color_, u32 texpage_,
                                            u16 packed_texcoord, u32 uv_limits_)
{
  Set(x_, y_, z_, w_, color_, texpage_, packed_texcoord & 0xFF, (packed_texcoord >> 8), uv_limits_);
}
    203 
// Assigns every vertex attribute directly; no conversion or packing is performed here.
ALWAYS_INLINE void GPU_HW::BatchVertex::Set(float x_, float y_, float z_, float w_, u32 color_, u32 texpage_, u16 u_,
                                            u16 v_, u32 uv_limits_)
{
  x = x_;
  y = y_;
  z = z_;
  w = w_;
  color = color_;
  texpage = texpage_;
  u = u_;
  v = v_;
  uv_limits = uv_limits_;
}
    217 
    218 ALWAYS_INLINE u32 GPU_HW::BatchVertex::PackUVLimits(u32 min_u, u32 max_u, u32 min_v, u32 max_v)
    219 {
    220   return min_u | (min_v << 8) | (max_u << 16) | (max_v << 24);
    221 }
    222 
// Stores the packed [min_u, max_u] x [min_v, max_v] clamp range for this vertex.
ALWAYS_INLINE void GPU_HW::BatchVertex::SetUVLimits(u32 min_u, u32 max_u, u32 min_v, u32 max_v)
{
  uv_limits = PackUVLimits(min_u, max_u, min_v, max_v);
}
    227 
// Returns the software-renderer worker thread, or null when no SW backend is active.
const Threading::Thread* GPU_HW::GetSWThread() const
{
  return m_sw_renderer ? m_sw_renderer->GetThread() : nullptr;
}
    232 
// This class is always the hardware renderer, even when a SW backend runs alongside it.
bool GPU_HW::IsHardwareRenderer() const
{
  return true;
}
    237 
bool GPU_HW::Initialize()
{
  if (!GPU::Initialize())
    return false;

  const GPUDevice::Features features = g_gpu_device->GetFeatures();

  // Cache all derived settings first: CheckSettings(), CompilePipelines() and
  // CreateBuffers() below all read these members.
  m_resolution_scale = Truncate8(CalculateResolutionScale());
  m_multisamples = Truncate8(std::min<u32>(g_settings.gpu_multisamples, g_gpu_device->GetMaxMultisamples()));
  m_texture_filtering = g_settings.gpu_texture_filter;
  m_sprite_texture_filtering = g_settings.gpu_sprite_texture_filter;
  // Line detection is only meaningful when upscaling.
  m_line_detect_mode = (m_resolution_scale > 1) ? g_settings.gpu_line_detect_mode : GPULineDetectMode::Disabled;
  m_downsample_mode = GetDownsampleMode(m_resolution_scale);
  m_wireframe_mode = g_settings.gpu_wireframe_mode;
  m_supports_dual_source_blend = features.dual_source_blend;
  m_supports_framebuffer_fetch = features.framebuffer_fetch;
  m_true_color = g_settings.gpu_true_color;
  m_pgxp_depth_buffer = g_settings.UsingPGXPDepthBuffer();
  m_clamp_uvs = ShouldClampUVs(m_texture_filtering) || ShouldClampUVs(m_sprite_texture_filtering);
  m_compute_uv_range = m_clamp_uvs;
  m_allow_sprite_mode = ShouldAllowSpriteMode(m_resolution_scale, m_texture_filtering, m_sprite_texture_filtering);

  // Downgrade any options the device cannot support (warning the user) before compiling.
  CheckSettings();

  UpdateSoftwareRenderer(false);

  PrintSettingsToLog();

  Error error;
  if (!CompilePipelines(&error))
  {
    ERROR_LOG("Failed to compile pipelines: {}", error.GetDescription());
    return false;
  }

  if (!CreateBuffers())
  {
    ERROR_LOG("Failed to create framebuffer");
    return false;
  }

  UpdateDownsamplingLevels();
  RestoreDeviceContext();
  return true;
}
    283 
void GPU_HW::Reset(bool clear_vram)
{
  // Discard any partially-built vertex batch before resetting state.
  if (m_batch_vertex_ptr)
    UnmapGPUBuffer(0, 0);

  GPU::Reset(clear_vram);

  if (m_sw_renderer)
    m_sw_renderer->Reset();

  // Clear batching state; the uniform buffer must be re-uploaded on the next draw.
  m_batch = {};
  m_batch_ubo_data = {};
  m_batch_ubo_dirty = true;
  m_current_depth = 1;
  SetClampedDrawingArea();

  if (clear_vram)
    ClearFramebuffer();
}
    303 
bool GPU_HW::DoState(StateWrapper& sw, GPUTexture** host_texture, bool update_display)
{
  // Need to download local VRAM copy before calling the base class, because it serializes this.
  if (m_sw_renderer)
  {
    m_sw_renderer->Sync(true);
  }
  else if (sw.IsWriting() && !host_texture)
  {
    // If SW renderer readbacks aren't enabled, the CLUT won't be populated, which means it'll be invalid if the user
    // loads this state with software instead of hardware renderers. So force-update the CLUT.
    ReadVRAM(0, 0, VRAM_WIDTH, VRAM_HEIGHT);
    if (IsCLUTValid())
      GPU::ReadCLUT(g_gpu_clut, GPUTexturePaletteReg{Truncate16(m_current_clut_reg_bits)}, m_current_clut_is_8bit);
  }

  if (!GPU::DoState(sw, host_texture, update_display))
    return false;

  // GPU-side path: VRAM travels via a host texture rather than a CPU readback.
  if (host_texture)
  {
    GPUTexture* tex = *host_texture;
    if (sw.IsReading())
    {
      // The provided texture must match the current framebuffer layout exactly.
      if (tex->GetWidth() != m_vram_texture->GetWidth() || tex->GetHeight() != m_vram_texture->GetHeight() ||
          tex->GetSamples() != m_vram_texture->GetSamples())
      {
        return false;
      }

      g_gpu_device->CopyTextureRegion(m_vram_texture.get(), 0, 0, 0, 0, tex, 0, 0, 0, 0, tex->GetWidth(),
                                      tex->GetHeight());
    }
    else
    {
      // (Re)allocate the save-state texture if it's missing or no longer matches.
      if (!tex || tex->GetWidth() != m_vram_texture->GetWidth() || tex->GetHeight() != m_vram_texture->GetHeight() ||
          tex->GetSamples() != m_vram_texture->GetSamples())
      {
        delete tex;

        // We copy to/from the save state texture, but we can't have multisampled non-RTs.
        tex = g_gpu_device
                ->FetchTexture(
                  m_vram_texture->GetWidth(), m_vram_texture->GetHeight(), 1, 1, m_vram_texture->GetSamples(),
                  m_vram_texture->IsMultisampled() ? GPUTexture::Type::RenderTarget : GPUTexture::Type::Texture,
                  GPUTexture::Format::RGBA8, nullptr, 0)
                .release();
        *host_texture = tex;
        if (!tex)
          return false;
      }

      g_gpu_device->CopyTextureRegion(tex, 0, 0, 0, 0, m_vram_texture.get(), 0, 0, 0, 0, tex->GetWidth(),
                                      tex->GetHeight());
    }
  }
  else if (sw.IsReading())
  {
    // Need to update the VRAM copy on the GPU with the state data.
    UpdateVRAMOnGPU(0, 0, VRAM_WIDTH, VRAM_HEIGHT, g_vram, VRAM_WIDTH * sizeof(u16), false, false, VRAM_SIZE_RECT);
  }

  // invalidate the whole VRAM read texture when loading state
  if (sw.IsReading())
  {
    DebugAssert(!m_batch_vertex_ptr && !m_batch_index_ptr);
    ClearVRAMDirtyRectangle();
    SetFullVRAMDirtyRectangle();
    ResetBatchVertexDepth();
  }

  return true;
}
    377 
void GPU_HW::RestoreDeviceContext()
{
  // Rebind everything VRAM rendering expects: read texture, render target, viewport and
  // scissor. Called after other code (display, postprocessing, ...) has used the device.
  g_gpu_device->SetTextureSampler(0, m_vram_read_texture.get(), g_gpu_device->GetNearestSampler());
  SetVRAMRenderTarget();
  g_gpu_device->SetViewport(m_vram_texture->GetRect());
  SetScissor();
  // Uniforms may have been clobbered as well; force a re-upload on the next batch.
  m_batch_ubo_dirty = true;
}
    386 
void GPU_HW::UpdateSettings(const Settings& old_settings)
{
  GPU::UpdateSettings(old_settings);

  const GPUDevice::Features features = g_gpu_device->GetFeatures();

  // Recompute derived values so they can be diffed against the currently-active state.
  const u8 resolution_scale = Truncate8(CalculateResolutionScale());
  const u8 multisamples = Truncate8(std::min<u32>(g_settings.gpu_multisamples, g_gpu_device->GetMaxMultisamples()));
  const bool clamp_uvs = ShouldClampUVs(m_texture_filtering) || ShouldClampUVs(m_sprite_texture_filtering);

  // The framebuffer must be recreated when the VRAM texture size/sample count changes,
  // or when blending/depth modes that affect texture usage change.
  const bool framebuffer_changed = (m_resolution_scale != resolution_scale || m_multisamples != multisamples ||
                                    g_settings.IsUsingAccurateBlending() != old_settings.IsUsingAccurateBlending() ||
                                    m_pgxp_depth_buffer != g_settings.UsingPGXPDepthBuffer());

  // Pipelines must be recompiled when anything baked into the shader/pipeline keys changes.
  // NOTE(review): `multisamples > 0` is always true (the clamped count is at least 1);
  // this looks like it was meant to be `> 1` — confirm against upstream before changing.
  // NOTE(review): comparing ShouldDisableColorPerspective() (derived from new settings)
  // against old_settings.gpu_pgxp_color_correction mixes two different predicates — verify.
  const bool shaders_changed =
    (m_resolution_scale != resolution_scale || m_multisamples != multisamples ||
     m_true_color != g_settings.gpu_true_color || g_settings.gpu_debanding != old_settings.gpu_debanding ||
     (multisamples > 0 && g_settings.gpu_per_sample_shading != old_settings.gpu_per_sample_shading) ||
     (resolution_scale > 1 && g_settings.gpu_scaled_dithering != old_settings.gpu_scaled_dithering) ||
     (resolution_scale > 1 && g_settings.gpu_texture_filter == GPUTextureFilter::Nearest &&
      g_settings.gpu_force_round_texcoords != old_settings.gpu_force_round_texcoords) ||
     g_settings.IsUsingAccurateBlending() != old_settings.IsUsingAccurateBlending() ||
     m_texture_filtering != g_settings.gpu_texture_filter ||
     m_sprite_texture_filtering != g_settings.gpu_sprite_texture_filter || m_clamp_uvs != clamp_uvs ||
     (resolution_scale > 1 && (g_settings.gpu_downsample_mode != old_settings.gpu_downsample_mode ||
                               (m_downsample_mode == GPUDownsampleMode::Box &&
                                g_settings.gpu_downsample_scale != old_settings.gpu_downsample_scale))) ||
     (features.geometry_shaders && g_settings.gpu_wireframe_mode != old_settings.gpu_wireframe_mode) ||
     m_pgxp_depth_buffer != g_settings.UsingPGXPDepthBuffer() ||
     (features.noperspective_interpolation &&
      ShouldDisableColorPerspective() != old_settings.gpu_pgxp_color_correction) ||
     m_allow_sprite_mode !=
       ShouldAllowSpriteMode(m_resolution_scale, g_settings.gpu_texture_filter, g_settings.gpu_sprite_texture_filter));

  // Notify the user about the new internal resolution.
  if (m_resolution_scale != resolution_scale)
  {
    Host::AddIconOSDMessage(
      "ResolutionScaleChanged", ICON_FA_PAINT_BRUSH,
      fmt::format(TRANSLATE_FS("GPU_HW", "Resolution scale set to {0}x (display {1}x{2}, VRAM {3}x{4})"),
                  resolution_scale, m_crtc_state.display_vram_width * resolution_scale,
                  resolution_scale * m_crtc_state.display_vram_height, VRAM_WIDTH * resolution_scale,
                  VRAM_HEIGHT * resolution_scale),
      Host::OSD_INFO_DURATION);
  }

  // Likewise for MSAA/SSAA changes.
  if (m_multisamples != multisamples || g_settings.gpu_per_sample_shading != old_settings.gpu_per_sample_shading)
  {
    if (g_settings.gpu_per_sample_shading && features.per_sample_shading)
    {
      Host::AddIconOSDMessage(
        "MultisamplingChanged", ICON_FA_PAINT_BRUSH,
        fmt::format(TRANSLATE_FS("GPU_HW", "Multisample anti-aliasing set to {}x (SSAA)."), multisamples),
        Host::OSD_INFO_DURATION);
    }
    else
    {
      Host::AddIconOSDMessage(
        "MultisamplingChanged", ICON_FA_PAINT_BRUSH,
        fmt::format(TRANSLATE_FS("GPU_HW", "Multisample anti-aliasing set to {}x."), multisamples),
        Host::OSD_INFO_DURATION);
    }
  }

  // Back up VRAM if we're recreating the framebuffer.
  if (framebuffer_changed)
  {
    RestoreDeviceContext();
    ReadVRAM(0, 0, VRAM_WIDTH, VRAM_HEIGHT);
    DestroyBuffers();
  }

  // Commit the newly-derived state.
  m_resolution_scale = resolution_scale;
  m_multisamples = multisamples;
  m_texture_filtering = g_settings.gpu_texture_filter;
  m_sprite_texture_filtering = g_settings.gpu_sprite_texture_filter;
  m_line_detect_mode = (m_resolution_scale > 1) ? g_settings.gpu_line_detect_mode : GPULineDetectMode::Disabled;
  m_downsample_mode = GetDownsampleMode(resolution_scale);
  m_wireframe_mode = g_settings.gpu_wireframe_mode;
  m_true_color = g_settings.gpu_true_color;
  m_clamp_uvs = clamp_uvs;
  m_compute_uv_range = m_clamp_uvs;
  m_allow_sprite_mode = ShouldAllowSpriteMode(resolution_scale, m_texture_filtering, m_sprite_texture_filtering);
  // Sprite batching can only stay enabled if sprite mode is still allowed.
  m_batch.sprite_mode = (m_allow_sprite_mode && m_batch.sprite_mode);

  const bool depth_buffer_changed = (m_pgxp_depth_buffer != g_settings.UsingPGXPDepthBuffer());
  if (depth_buffer_changed)
  {
    // Switching depth modes invalidates any previously-copied depth state.
    m_pgxp_depth_buffer = g_settings.UsingPGXPDepthBuffer();
    m_batch.use_depth_buffer = false;
    m_depth_was_copied = false;
  }

  CheckSettings();

  UpdateSoftwareRenderer(true);

  PrintSettingsToLog();

  if (shaders_changed)
  {
    DestroyPipelines();

    Error error;
    if (!CompilePipelines(&error))
    {
      ERROR_LOG("Failed to recompile pipelines: {}", error.GetDescription());
      Panic("Failed to recompile pipelines.");
    }
  }

  if (framebuffer_changed)
  {
    // When using very high upscaling, it's possible that we don't have enough VRAM for two sets of buffers.
    // Purge the pool, and idle the GPU so that all video memory is freed prior to creating the new buffers.
    g_gpu_device->PurgeTexturePool();
    g_gpu_device->ExecuteAndWaitForGPUIdle();

    if (!CreateBuffers())
      Panic("Failed to recreate buffers.");

    UpdateDownsamplingLevels();
    RestoreDeviceContext();
    // Restore the VRAM backup captured above, then rebuild depth/display state from it.
    UpdateVRAM(0, 0, VRAM_WIDTH, VRAM_HEIGHT, g_vram, false, false);
    if (m_write_mask_as_depth)
      UpdateDepthBufferFromMaskBit();
    UpdateDisplay();
  }
  else if (m_vram_depth_texture && depth_buffer_changed)
  {
    if (m_pgxp_depth_buffer)
      ClearDepthBuffer();
    else if (m_write_mask_as_depth)
      UpdateDepthBufferFromMaskBit();
  }

  if (g_settings.gpu_downsample_mode != old_settings.gpu_downsample_mode ||
      (g_settings.gpu_downsample_mode == GPUDownsampleMode::Box &&
       g_settings.gpu_downsample_scale != old_settings.gpu_downsample_scale))
  {
    UpdateDownsamplingLevels();
  }
}
    527 
    528 void GPU_HW::CheckSettings()
    529 {
    530   const GPUDevice::Features features = g_gpu_device->GetFeatures();
    531 
    532   if (m_multisamples != g_settings.gpu_multisamples)
    533   {
    534     Host::AddIconOSDMessage("MSAAUnsupported", ICON_EMOJI_WARNING,
    535                             fmt::format(TRANSLATE_FS("GPU_HW", "{}x MSAA is not supported, using {}x instead."),
    536                                         g_settings.gpu_multisamples, m_multisamples),
    537                             Host::OSD_CRITICAL_ERROR_DURATION);
    538   }
    539   else
    540   {
    541     Host::RemoveKeyedOSDMessage("MSAAUnsupported");
    542   }
    543 
    544   if (g_settings.gpu_per_sample_shading && !features.per_sample_shading)
    545   {
    546     Host::AddIconOSDMessage("SSAAUnsupported", ICON_EMOJI_WARNING,
    547                             TRANSLATE_STR("GPU_HW", "SSAA is not supported, using MSAA instead."),
    548                             Host::OSD_ERROR_DURATION);
    549   }
    550   if (!features.dual_source_blend && !features.framebuffer_fetch &&
    551       (IsBlendedTextureFiltering(m_texture_filtering) || IsBlendedTextureFiltering(m_sprite_texture_filtering)))
    552   {
    553     Host::AddIconOSDMessage(
    554       "TextureFilterUnsupported", ICON_EMOJI_WARNING,
    555       fmt::format(TRANSLATE_FS("GPU_HW", "Texture filter '{}/{}' is not supported with the current renderer."),
    556                   Settings::GetTextureFilterDisplayName(m_texture_filtering),
    557                   Settings::GetTextureFilterName(m_sprite_texture_filtering), Host::OSD_ERROR_DURATION));
    558     m_texture_filtering = GPUTextureFilter::Nearest;
    559     m_sprite_texture_filtering = GPUTextureFilter::Nearest;
    560     m_allow_sprite_mode = ShouldAllowSpriteMode(m_resolution_scale, m_texture_filtering, m_sprite_texture_filtering);
    561   }
    562 
    563   if (g_settings.IsUsingAccurateBlending() && !m_supports_framebuffer_fetch && !features.feedback_loops &&
    564       !features.raster_order_views)
    565   {
    566     // m_allow_shader_blend/m_prefer_shader_blend will be cleared in pipeline compile.
    567     Host::AddIconOSDMessage(
    568       "AccurateBlendingUnsupported", ICON_EMOJI_WARNING,
    569       TRANSLATE_STR("GPU_HW", "Accurate blending is not supported by your current GPU.\nIt requires framebuffer fetch, "
    570                               "feedback loops, or rasterizer order views."),
    571       Host::OSD_WARNING_DURATION);
    572   }
    573   else if (IsUsingMultisampling() && !features.framebuffer_fetch &&
    574            ((g_settings.IsUsingAccurateBlending() && features.raster_order_views) ||
    575             (m_pgxp_depth_buffer && features.raster_order_views && !features.feedback_loops)))
    576   {
    577     Host::AddIconOSDMessage(
    578       "AccurateBlendingUnsupported", ICON_EMOJI_WARNING,
    579       TRANSLATE_STR("GPU_HW", "Multisample anti-aliasing is not supported when using ROV blending."),
    580       Host::OSD_WARNING_DURATION);
    581     m_multisamples = 1;
    582   }
    583 
    584   if (m_pgxp_depth_buffer && !features.feedback_loops && !features.framebuffer_fetch && !features.raster_order_views)
    585   {
    586     Host::AddIconOSDMessage(
    587       "AccurateBlendingUnsupported", ICON_EMOJI_WARNING,
    588       TRANSLATE_STR("GPU_HW", "PGXP depth buffer is not supported by your current GPU or renderer.\nIt requires "
    589                               "framebuffer fetch, feedback loops, or rasterizer order views."),
    590       Host::OSD_WARNING_DURATION);
    591     m_pgxp_depth_buffer = false;
    592   }
    593 
    594   if (!features.noperspective_interpolation && !ShouldDisableColorPerspective())
    595     WARNING_LOG("Disable color perspective not supported, but should be used.");
    596 
    597   if (!features.geometry_shaders && m_wireframe_mode != GPUWireframeMode::Disabled)
    598   {
    599     Host::AddIconOSDMessage(
    600       "GeometryShadersUnsupported", ICON_EMOJI_WARNING,
    601       TRANSLATE("GPU_HW", "Geometry shaders are not supported by your GPU, and are required for wireframe rendering."),
    602       Host::OSD_CRITICAL_ERROR_DURATION);
    603     m_wireframe_mode = GPUWireframeMode::Disabled;
    604   }
    605 
    606   if (m_downsample_mode == GPUDownsampleMode::Box)
    607   {
    608     const u32 resolution_scale = CalculateResolutionScale();
    609     const u32 box_downscale = GetBoxDownsampleScale(resolution_scale);
    610     if (box_downscale != g_settings.gpu_downsample_scale || box_downscale == resolution_scale)
    611     {
    612       Host::AddIconOSDMessage(
    613         "BoxDownsampleUnsupported", ICON_FA_PAINT_BRUSH,
    614         fmt::format(TRANSLATE_FS(
    615                       "GPU_HW", "Resolution scale {0}x is not divisible by downsample scale {1}x, using {2}x instead."),
    616                     resolution_scale, g_settings.gpu_downsample_scale, box_downscale),
    617         Host::OSD_WARNING_DURATION);
    618     }
    619     else
    620     {
    621       Host::RemoveKeyedOSDMessage("BoxDownsampleUnsupported");
    622     }
    623 
    624     if (box_downscale == g_settings.gpu_resolution_scale)
    625       m_downsample_mode = GPUDownsampleMode::Disabled;
    626   }
    627 }
    628 
// Returns the effective internal resolution scale: either the user's explicit setting,
// or (when the setting is 0) an automatic scale derived from the window height.
u32 GPU_HW::CalculateResolutionScale() const
{
  const u32 max_resolution_scale = GetMaxResolutionScale();

  u32 scale;
  if (g_settings.gpu_resolution_scale != 0)
  {
    // Explicit user-selected scale, clamped to what the device can allocate.
    scale = std::clamp<u32>(g_settings.gpu_resolution_scale, 1, max_resolution_scale);
  }
  else
  {
    // Auto scaling. When the system is starting and all borders crop is enabled, the registers are zero, and
    // display_height therefore is also zero. Use the default size from the region in this case.
    const s32 height = (m_crtc_state.display_height != 0) ?
                         static_cast<s32>(m_crtc_state.display_height) :
                         (m_console_is_pal ? (PAL_VERTICAL_ACTIVE_END - PAL_VERTICAL_ACTIVE_START) :
                                             (NTSC_VERTICAL_ACTIVE_END - NTSC_VERTICAL_ACTIVE_START));

    float widescreen_multiplier = 1.0f;
    if (g_settings.gpu_widescreen_hack)
    {
      // Multiply scale factor by aspect ratio relative to 4:3, so that widescreen resolution is as close as possible to
      // native screen resolution. Otherwise, anamorphic stretching would result in increasingly less horizontal
      // resolution (relative to native screen resolution) as the aspect ratio gets wider.
      widescreen_multiplier = std::max(1.0f, (static_cast<float>(g_gpu_device->GetWindowWidth()) /
                                              static_cast<float>(g_gpu_device->GetWindowHeight())) /
                                               (4.0f / 3.0f));
    }

    // Smallest integer scale that fills the window vertically.
    const s32 preferred_scale =
      static_cast<s32>(std::ceil(static_cast<float>(g_gpu_device->GetWindowHeight() * widescreen_multiplier) / height));
    VERBOSE_LOG("Height = {}, preferred scale = {}", height, preferred_scale);

    scale = static_cast<u32>(std::clamp<s32>(preferred_scale, 1, max_resolution_scale));
  }

  // Adaptive downsampling needs a power-of-two scale; round down when it isn't one.
  if (g_settings.gpu_downsample_mode == GPUDownsampleMode::Adaptive && scale > 1 && !Common::IsPow2(scale))
  {
    const u32 new_scale = Common::PreviousPow2(scale);
    WARNING_LOG("Resolution scale {}x not supported for adaptive downsampling, using {}x", scale, new_scale);

    // Only nag the user when they explicitly chose the unsupported scale.
    if (g_settings.gpu_resolution_scale != 0)
    {
      Host::AddIconOSDMessage(
        "ResolutionNotPow2", ICON_FA_PAINT_BRUSH,
        fmt::format(
          TRANSLATE_FS("GPU_HW", "Resolution scale {0}x not supported for adaptive downsampling, using {1}x."), scale,
          new_scale),
        Host::OSD_WARNING_DURATION);
    }

    scale = new_scale;
  }

  return scale;
}
    685 
void GPU_HW::UpdateResolutionScale()
{
  GPU::UpdateResolutionScale();

  // In auto mode the derived scale depends on the window size; reapply settings (which
  // recreates buffers/pipelines) if it no longer matches the active scale.
  if (CalculateResolutionScale() != m_resolution_scale)
    UpdateSettings(g_settings);
}
    693 
    694 GPUDownsampleMode GPU_HW::GetDownsampleMode(u32 resolution_scale) const
    695 {
    696   return (resolution_scale == 1) ? GPUDownsampleMode::Disabled : g_settings.gpu_downsample_mode;
    697 }
    698 
    699 bool GPU_HW::IsUsingMultisampling() const
    700 {
    701   return m_multisamples > 1;
    702 }
    703 
    704 bool GPU_HW::IsUsingDownsampling() const
    705 {
    706   return (m_downsample_mode != GPUDownsampleMode::Disabled && !m_GPUSTAT.display_area_color_depth_24);
    707 }
    708 
void GPU_HW::SetFullVRAMDirtyRectangle()
{
  // Mark all of VRAM as needing a read-texture refresh, and flag the texture page so the
  // next textured draw re-fetches it.
  m_vram_dirty_draw_rect = VRAM_SIZE_RECT;
  m_draw_mode.SetTexturePageChanged();
}
    714 
// Resets both dirty-region trackers (draw and CPU-write) to the empty rectangle.
void GPU_HW::ClearVRAMDirtyRectangle()
{
  m_vram_dirty_draw_rect = INVALID_RECT;
  m_vram_dirty_write_rect = INVALID_RECT;
}
    720 
// Accumulates a CPU-written VRAM region into the write dirty rectangle, and
// flags the texture page as changed if the accumulated region overlaps it.
void GPU_HW::AddWrittenRectangle(const GSVector4i rect)
{
  m_vram_dirty_write_rect = m_vram_dirty_write_rect.runion(rect);
  SetTexPageChangedOnOverlap(m_vram_dirty_write_rect);
}
    726 
// Accumulates a GPU-drawn region into the draw dirty rectangle.
void GPU_HW::AddDrawnRectangle(const GSVector4i rect)
{
  // Normally, we would check for overlap here. But the GPU's texture cache won't actually reload until the page
  // changes, or it samples a larger region, so we can get away without doing so. This reduces copies considerably in
  // games like Mega Man Legends 2.
  m_vram_dirty_draw_rect = m_vram_dirty_draw_rect.runion(rect);
}
    734 
// Like AddDrawnRectangle(), but additionally flags the texture page as changed
// when the accumulated draw region overlaps it (AddDrawnRectangle deliberately
// skips that check as an optimization).
void GPU_HW::AddUnclampedDrawnRectangle(const GSVector4i rect)
{
  m_vram_dirty_draw_rect = m_vram_dirty_draw_rect.runion(rect);
  SetTexPageChangedOnOverlap(m_vram_dirty_draw_rect);
}
    740 
    741 void GPU_HW::SetTexPageChangedOnOverlap(const GSVector4i update_rect)
    742 {
    743   // the vram area can include the texture page, but the game can leave it as-is. in this case, set it as dirty so the
    744   // shadow texture is updated
    745   if (!m_draw_mode.IsTexturePageChanged() && m_batch.texture_mode != BatchTextureMode::Disabled &&
    746       (m_draw_mode.mode_reg.GetTexturePageRectangle().rintersects(update_rect) ||
    747        (m_draw_mode.mode_reg.IsUsingPalette() &&
    748         m_draw_mode.palette_reg.GetRectangle(m_draw_mode.mode_reg.texture_mode).rintersects(update_rect))))
    749   {
    750     m_draw_mode.SetTexturePageChanged();
    751   }
    752 }
    753 
    754 std::tuple<u32, u32> GPU_HW::GetEffectiveDisplayResolution(bool scaled /* = true */)
    755 {
    756   const u32 scale = scaled ? m_resolution_scale : 1u;
    757   return std::make_tuple(m_crtc_state.display_vram_width * scale, m_crtc_state.display_vram_height * scale);
    758 }
    759 
    760 std::tuple<u32, u32> GPU_HW::GetFullDisplayResolution(bool scaled /* = true */)
    761 {
    762   const u32 scale = scaled ? m_resolution_scale : 1u;
    763   return std::make_tuple(m_crtc_state.display_width * scale, m_crtc_state.display_height * scale);
    764 }
    765 
// Dumps the resolved hardware-renderer configuration to the log, to aid in
// diagnosing rendering issues from user reports.
void GPU_HW::PrintSettingsToLog()
{
  INFO_LOG("Resolution Scale: {} ({}x{}), maximum {}", m_resolution_scale, VRAM_WIDTH * m_resolution_scale,
           VRAM_HEIGHT * m_resolution_scale, GetMaxResolutionScale());
  INFO_LOG("Multisampling: {}x{}", m_multisamples,
           (g_settings.gpu_per_sample_shading && g_gpu_device->GetFeatures().per_sample_shading) ?
             " (per sample shading)" :
             "");
  // True colour implies dithering is disabled; debanding is only reported when true colour is on.
  INFO_LOG("Dithering: {}{}", m_true_color ? "Disabled" : "Enabled",
           (!m_true_color && g_settings.gpu_scaled_dithering) ?
             " (Scaled)" :
             ((m_true_color && g_settings.gpu_debanding) ? " (Debanding)" : ""));
  // Round texcoords only applies at scales above native.
  INFO_LOG("Force round texture coordinates: {}",
           (m_resolution_scale > 1 && g_settings.gpu_force_round_texcoords) ? "Enabled" : "Disabled");
  INFO_LOG("Texture Filtering: {}/{}", Settings::GetTextureFilterDisplayName(m_texture_filtering),
           Settings::GetTextureFilterDisplayName(m_sprite_texture_filtering));
  INFO_LOG("Dual-source blending: {}", m_supports_dual_source_blend ? "Supported" : "Not supported");
  INFO_LOG("Clamping UVs: {}", m_clamp_uvs ? "YES" : "NO");
  INFO_LOG("Depth buffer: {}", m_pgxp_depth_buffer ? "YES" : "NO");
  INFO_LOG("Downsampling: {}", Settings::GetDownsampleModeDisplayName(m_downsample_mode));
  INFO_LOG("Wireframe rendering: {}", Settings::GetGPUWireframeModeDisplayName(m_wireframe_mode));
  INFO_LOG("Line detection: {}", Settings::GetLineDetectModeDisplayName(m_line_detect_mode));
  INFO_LOG("Using software renderer for readbacks: {}", m_sw_renderer ? "YES" : "NO");
  INFO_LOG("Separate sprite shaders: {}", m_allow_sprite_mode ? "YES" : "NO");
}
    791 
    792 GPUTexture::Format GPU_HW::GetDepthBufferFormat() const
    793 {
    794   // Use 32-bit depth for PGXP depth buffer, otherwise 16-bit for mask bit.
    795   return m_pgxp_depth_buffer ? (m_use_rov_for_shader_blend ? VRAM_DS_COLOR_FORMAT : VRAM_DS_DEPTH_FORMAT) :
    796                                VRAM_DS_FORMAT;
    797 }
    798 
    799 bool GPU_HW::CreateBuffers()
    800 {
    801   DestroyBuffers();
    802 
    803   // scale vram size to internal resolution
    804   const u32 texture_width = VRAM_WIDTH * m_resolution_scale;
    805   const u32 texture_height = VRAM_HEIGHT * m_resolution_scale;
    806   const u8 samples = static_cast<u8>(m_multisamples);
    807   const bool needs_depth_buffer = m_write_mask_as_depth || m_pgxp_depth_buffer;
    808 
    809   // Needed for Metal resolve.
    810   const GPUTexture::Type read_texture_type = (g_gpu_device->GetRenderAPI() == RenderAPI::Metal && m_multisamples > 1) ?
    811                                                GPUTexture::Type::RWTexture :
    812                                                GPUTexture::Type::Texture;
    813   const GPUTexture::Type vram_texture_type =
    814     m_use_rov_for_shader_blend ? GPUTexture::Type::RWTexture : GPUTexture::Type::RenderTarget;
    815   const GPUTexture::Type depth_texture_type =
    816     m_use_rov_for_shader_blend ? GPUTexture::Type::RWTexture : GPUTexture::Type::DepthStencil;
    817 
    818   if (!(m_vram_texture = g_gpu_device->FetchTexture(texture_width, texture_height, 1, 1, samples, vram_texture_type,
    819                                                     VRAM_RT_FORMAT)) ||
    820       (needs_depth_buffer &&
    821        !(m_vram_depth_texture = g_gpu_device->FetchTexture(texture_width, texture_height, 1, 1, samples,
    822                                                            depth_texture_type, GetDepthBufferFormat()))) ||
    823       (m_pgxp_depth_buffer && !(m_vram_depth_copy_texture =
    824                                   g_gpu_device->FetchTexture(texture_width, texture_height, 1, 1, samples,
    825                                                              GPUTexture::Type::RenderTarget, VRAM_DS_COLOR_FORMAT))) ||
    826       !(m_vram_read_texture =
    827           g_gpu_device->FetchTexture(texture_width, texture_height, 1, 1, 1, read_texture_type, VRAM_RT_FORMAT)) ||
    828       !(m_vram_readback_texture = g_gpu_device->FetchTexture(VRAM_WIDTH / 2, VRAM_HEIGHT, 1, 1, 1,
    829                                                              GPUTexture::Type::RenderTarget, VRAM_RT_FORMAT)))
    830   {
    831     return false;
    832   }
    833 
    834   GL_OBJECT_NAME(m_vram_texture, "VRAM Texture");
    835   if (m_vram_depth_texture)
    836     GL_OBJECT_NAME(m_vram_depth_texture, "VRAM Depth Texture");
    837   GL_OBJECT_NAME(m_vram_read_texture, "VRAM Read Texture");
    838   GL_OBJECT_NAME(m_vram_readback_texture, "VRAM Readback Texture");
    839 
    840   if (g_gpu_device->GetFeatures().memory_import)
    841   {
    842     DEV_LOG("Trying to import guest VRAM buffer for downloads...");
    843     m_vram_readback_download_texture = g_gpu_device->CreateDownloadTexture(
    844       m_vram_readback_texture->GetWidth(), m_vram_readback_texture->GetHeight(), m_vram_readback_texture->GetFormat(),
    845       g_vram, sizeof(g_vram), VRAM_WIDTH * sizeof(u16));
    846     if (!m_vram_readback_download_texture)
    847       ERROR_LOG("Failed to create imported readback buffer");
    848   }
    849   if (!m_vram_readback_download_texture)
    850   {
    851     m_vram_readback_download_texture = g_gpu_device->CreateDownloadTexture(
    852       m_vram_readback_texture->GetWidth(), m_vram_readback_texture->GetHeight(), m_vram_readback_texture->GetFormat());
    853     if (!m_vram_readback_download_texture)
    854     {
    855       ERROR_LOG("Failed to create readback download texture");
    856       return false;
    857     }
    858   }
    859 
    860   if (g_gpu_device->GetFeatures().supports_texture_buffers)
    861   {
    862     if (!(m_vram_upload_buffer =
    863             g_gpu_device->CreateTextureBuffer(GPUTextureBuffer::Format::R16UI, GPUDevice::MIN_TEXEL_BUFFER_ELEMENTS)))
    864     {
    865       return false;
    866     }
    867 
    868     GL_OBJECT_NAME(m_vram_upload_buffer, "VRAM Upload Buffer");
    869   }
    870 
    871   INFO_LOG("Created HW framebuffer of {}x{}", texture_width, texture_height);
    872 
    873   SetVRAMRenderTarget();
    874   SetFullVRAMDirtyRectangle();
    875   return true;
    876 }
    877 
    878 void GPU_HW::ClearFramebuffer()
    879 {
    880   g_gpu_device->ClearRenderTarget(m_vram_texture.get(), 0);
    881   if (m_vram_depth_texture)
    882   {
    883     if (m_use_rov_for_shader_blend)
    884       g_gpu_device->ClearRenderTarget(m_vram_depth_texture.get(), 0xFF);
    885     else
    886       g_gpu_device->ClearDepth(m_vram_depth_texture.get(), m_pgxp_depth_buffer ? 1.0f : 0.0f);
    887   }
    888   ClearVRAMDirtyRectangle();
    889   m_last_depth_z = 1.0f;
    890 }
    891 
    892 void GPU_HW::SetVRAMRenderTarget()
    893 {
    894   if (m_use_rov_for_shader_blend)
    895   {
    896     GPUTexture* rts[2] = {m_vram_texture.get(), m_vram_depth_texture.get()};
    897     const u32 num_rts = m_pgxp_depth_buffer ? 2 : 1;
    898     g_gpu_device->SetRenderTargets(
    899       rts, num_rts, nullptr, m_rov_active ? GPUPipeline::BindRenderTargetsAsImages : GPUPipeline::NoRenderPassFlags);
    900   }
    901   else
    902   {
    903     g_gpu_device->SetRenderTarget(m_vram_texture.get(), m_vram_depth_texture.get(),
    904                                   ((m_allow_shader_blend && !m_use_rov_for_shader_blend) ?
    905                                      GPUPipeline::ColorFeedbackLoop :
    906                                      GPUPipeline::NoRenderPassFlags));
    907   }
    908 }
    909 
// Leaves rasterizer-ordered-view mode (if active) and rebinds the render
// targets with normal (non-image) bindings.
void GPU_HW::DeactivateROV()
{
  if (!m_rov_active)
    return;

  GL_INS("Deactivating ROV.");
  m_rov_active = false;
  SetVRAMRenderTarget();
}
    919 
// Releases all VRAM-related GPU resources. Counterpart of CreateBuffers();
// safe to call when some or all resources were never created.
void GPU_HW::DestroyBuffers()
{
  ClearDisplayTexture();

  // Vertex and index buffers are mapped/unmapped as a pair; release any active mapping.
  DebugAssert((m_batch_vertex_ptr != nullptr) == (m_batch_index_ptr != nullptr));
  if (m_batch_vertex_ptr)
    UnmapGPUBuffer(0, 0);

  m_vram_upload_buffer.reset();
  m_vram_readback_download_texture.reset();
  // Textures are returned to the device's pool for reuse rather than destroyed.
  g_gpu_device->RecycleTexture(std::move(m_downsample_texture));
  g_gpu_device->RecycleTexture(std::move(m_vram_extract_depth_texture));
  g_gpu_device->RecycleTexture(std::move(m_vram_extract_texture));
  g_gpu_device->RecycleTexture(std::move(m_vram_read_texture));
  g_gpu_device->RecycleTexture(std::move(m_vram_depth_copy_texture));
  g_gpu_device->RecycleTexture(std::move(m_vram_depth_texture));
  g_gpu_device->RecycleTexture(std::move(m_vram_texture));
  g_gpu_device->RecycleTexture(std::move(m_vram_readback_texture));
}
    939 
    940 bool GPU_HW::CompilePipelines(Error* error)
    941 {
    942   const GPUDevice::Features features = g_gpu_device->GetFeatures();
    943   const bool per_sample_shading = g_settings.gpu_per_sample_shading && features.per_sample_shading;
    944   const bool force_round_texcoords = (m_resolution_scale > 1 && m_texture_filtering == GPUTextureFilter::Nearest &&
    945                                       g_settings.gpu_force_round_texcoords);
    946 
    947   // Determine when to use shader blending.
    948   // FBFetch is free, we need it for filtering without DSB, or when accurate blending is forced.
    949   // But, don't bother with accurate blending if true colour is on. The result will be the same.
    950   // Prefer ROV over barriers/feedback loops without FBFetch, it'll be faster.
    951   // Abuse the depth buffer for the mask bit when it's free (FBFetch), or PGXP depth buffering is enabled.
    952   m_allow_shader_blend = features.framebuffer_fetch ||
    953                          ((features.feedback_loops || features.raster_order_views) &&
    954                           (m_pgxp_depth_buffer || g_settings.IsUsingAccurateBlending() ||
    955                            (!m_supports_dual_source_blend && (IsBlendedTextureFiltering(m_texture_filtering) ||
    956                                                               IsBlendedTextureFiltering(m_sprite_texture_filtering)))));
    957   m_prefer_shader_blend = (m_allow_shader_blend && g_settings.IsUsingAccurateBlending());
    958   m_use_rov_for_shader_blend = (m_allow_shader_blend && !features.framebuffer_fetch && features.raster_order_views &&
    959                                 (m_prefer_shader_blend || !features.feedback_loops));
    960   m_write_mask_as_depth = (!m_pgxp_depth_buffer && !features.framebuffer_fetch && !m_prefer_shader_blend);
    961 
    962   // ROV doesn't support MSAA in DirectX.
    963   Assert(!m_use_rov_for_shader_blend || !IsUsingMultisampling());
    964 
    965   const bool needs_depth_buffer = (m_pgxp_depth_buffer || m_write_mask_as_depth);
    966   const bool needs_rov_depth = (m_pgxp_depth_buffer && m_use_rov_for_shader_blend);
    967   const bool needs_real_depth_buffer = (needs_depth_buffer && !needs_rov_depth);
    968   const bool needs_feedback_loop = (m_allow_shader_blend && features.feedback_loops && !m_use_rov_for_shader_blend);
    969   const GPUTexture::Format depth_buffer_format =
    970     needs_depth_buffer ? GetDepthBufferFormat() : GPUTexture::Format::Unknown;
    971 
    972   // Logging in case something goes wrong.
    973   INFO_LOG("Shader blending allowed: {}", m_allow_shader_blend ? "YES" : "NO");
    974   INFO_LOG("Shader blending preferred: {}", m_prefer_shader_blend ? "YES" : "NO");
    975   INFO_LOG("Use ROV for shader blending: {}", m_use_rov_for_shader_blend ? "YES" : "NO");
    976   INFO_LOG("Write mask as depth: {}", m_write_mask_as_depth ? "YES" : "NO");
    977   INFO_LOG("Depth buffer is {}needed in {}.", needs_depth_buffer ? "" : "NOT ",
    978            GPUTexture::GetFormatName(GetDepthBufferFormat()));
    979   INFO_LOG("Using ROV depth: {}", needs_rov_depth ? "YES" : "NO");
    980   INFO_LOG("Using real depth buffer: {}", needs_real_depth_buffer ? "YES" : "NO");
    981   INFO_LOG("Using feedback loops: {}", needs_feedback_loop ? "YES" : "NO");
    982 
    983   // Start generating shaders.
    984   GPU_HW_ShaderGen shadergen(g_gpu_device->GetRenderAPI(), m_resolution_scale, m_multisamples, per_sample_shading,
    985                              m_true_color, (m_resolution_scale > 1 && g_settings.gpu_scaled_dithering),
    986                              m_write_mask_as_depth, ShouldDisableColorPerspective(), m_supports_dual_source_blend,
    987                              m_supports_framebuffer_fetch, g_settings.gpu_true_color && g_settings.gpu_debanding);
    988 
    989   const u32 active_texture_modes =
    990     m_allow_sprite_mode ? NUM_TEXTURE_MODES :
    991                           (NUM_TEXTURE_MODES - (NUM_TEXTURE_MODES - static_cast<u32>(BatchTextureMode::SpriteStart)));
    992   const u32 total_pipelines =
    993     (m_allow_sprite_mode ? 5 : 3) +                                                    // vertex shaders
    994     (active_texture_modes * 5 * 9 * 2 * 2 * 2 * (1 + BoolToUInt32(needs_rov_depth))) + // fragment shaders
    995     ((m_pgxp_depth_buffer ? 2 : 1) * 5 * 5 * active_texture_modes * 2 * 2 * 2) +       // batch pipelines
    996     ((m_wireframe_mode != GPUWireframeMode::Disabled) ? 1 : 0) +                       // wireframe
    997     1 +                                                                                // fullscreen quad VS
    998     (2 * 2) +                                                                          // vram fill
    999     (1 + BoolToUInt32(m_write_mask_as_depth)) +                                        // vram copy
   1000     (1 + BoolToUInt32(m_write_mask_as_depth)) +                                        // vram write
   1001     1 +                                                                                // vram write replacement
   1002     (m_write_mask_as_depth ? 1 : 0) +                                                  // mask -> depth
   1003     1 +                                                                                // vram read
   1004     2 +                                                                                // extract/display
   1005     ((m_downsample_mode != GPUDownsampleMode::Disabled) ? 1 : 0);                      // downsample
   1006 
   1007   ShaderCompileProgressTracker progress("Compiling Pipelines", total_pipelines);
   1008 
   1009   // vertex shaders - [textured/palette/sprite]
   1010   // fragment shaders - [depth_test][render_mode][transparency_mode][texture_mode][check_mask][dithering][interlacing]
   1011   static constexpr auto destroy_shader = [](std::unique_ptr<GPUShader>& s) { s.reset(); };
   1012   DimensionalArray<std::unique_ptr<GPUShader>, 2, 2, 2> batch_vertex_shaders{};
   1013   DimensionalArray<std::unique_ptr<GPUShader>, 2, 2, 2, NUM_TEXTURE_MODES, 5, 5, 2> batch_fragment_shaders{};
   1014   ScopedGuard batch_shader_guard([&batch_vertex_shaders, &batch_fragment_shaders]() {
   1015     batch_vertex_shaders.enumerate(destroy_shader);
   1016     batch_fragment_shaders.enumerate(destroy_shader);
   1017   });
   1018 
   1019   for (u8 textured = 0; textured < 2; textured++)
   1020   {
   1021     for (u8 palette = 0; palette < (textured ? 2 : 1); palette++)
   1022     {
   1023       for (u8 sprite = 0; sprite < (textured ? 2 : 1); sprite++)
   1024       {
   1025         const bool uv_limits = ShouldClampUVs(sprite ? m_sprite_texture_filtering : m_texture_filtering);
   1026         const std::string vs = shadergen.GenerateBatchVertexShader(
   1027           textured != 0, palette != 0, uv_limits, !sprite && force_round_texcoords, m_pgxp_depth_buffer);
   1028         if (!(batch_vertex_shaders[textured][palette][sprite] =
   1029                 g_gpu_device->CreateShader(GPUShaderStage::Vertex, shadergen.GetLanguage(), vs, error)))
   1030         {
   1031           return false;
   1032         }
   1033 
   1034         progress.Increment();
   1035       }
   1036     }
   1037   }
   1038 
   1039   for (u8 depth_test = 0; depth_test < 2; depth_test++)
   1040   {
   1041     if (depth_test && !needs_rov_depth)
   1042     {
   1043       // Don't need to do depth testing in the shader.
   1044       continue;
   1045     }
   1046 
   1047     for (u8 render_mode = 0; render_mode < 5; render_mode++)
   1048     {
   1049       for (u8 transparency_mode = 0; transparency_mode < 5; transparency_mode++)
   1050       {
   1051         if (
   1052           // Can't generate shader blending.
   1053           ((render_mode == static_cast<u8>(BatchRenderMode::ShaderBlend) && !m_allow_shader_blend) ||
   1054            (render_mode != static_cast<u8>(BatchRenderMode::ShaderBlend) &&
   1055             transparency_mode != static_cast<u8>(GPUTransparencyMode::Disabled))) ||
   1056           // Don't need multipass shaders if we're preferring shader blend or have (free) FBFetch.
   1057           ((m_supports_framebuffer_fetch || m_prefer_shader_blend) &&
   1058            (render_mode == static_cast<u8>(BatchRenderMode::OnlyOpaque) ||
   1059             render_mode == static_cast<u8>(BatchRenderMode::OnlyTransparent))) ||
   1060           // If using ROV depth, we only draw with shader blending.
   1061           (needs_rov_depth && render_mode != static_cast<u8>(BatchRenderMode::ShaderBlend)))
   1062         {
   1063           progress.Increment(active_texture_modes * 2 * 2 * 2);
   1064           continue;
   1065         }
   1066 
   1067         for (u8 texture_mode = 0; texture_mode < active_texture_modes; texture_mode++)
   1068         {
   1069           for (u8 check_mask = 0; check_mask < 2; check_mask++)
   1070           {
   1071             if (check_mask && render_mode != static_cast<u8>(BatchRenderMode::ShaderBlend))
   1072             {
   1073               // mask bit testing is only valid with shader blending.
   1074               progress.Increment(2 * 2);
   1075               continue;
   1076             }
   1077 
   1078             for (u8 dithering = 0; dithering < 2; dithering++)
   1079             {
   1080               for (u8 interlacing = 0; interlacing < 2; interlacing++)
   1081               {
   1082                 const bool sprite = (static_cast<BatchTextureMode>(texture_mode) >= BatchTextureMode::SpriteStart);
   1083                 const bool uv_limits = ShouldClampUVs(sprite ? m_sprite_texture_filtering : m_texture_filtering);
   1084                 const BatchTextureMode shader_texmode = static_cast<BatchTextureMode>(
   1085                   texture_mode - (sprite ? static_cast<u8>(BatchTextureMode::SpriteStart) : 0));
   1086                 const bool use_rov =
   1087                   (render_mode == static_cast<u8>(BatchRenderMode::ShaderBlend) && m_use_rov_for_shader_blend);
   1088                 const std::string fs = shadergen.GenerateBatchFragmentShader(
   1089                   static_cast<BatchRenderMode>(render_mode), static_cast<GPUTransparencyMode>(transparency_mode),
   1090                   shader_texmode, sprite ? m_sprite_texture_filtering : m_texture_filtering, uv_limits,
   1091                   !sprite && force_round_texcoords, ConvertToBoolUnchecked(dithering),
   1092                   ConvertToBoolUnchecked(interlacing), ConvertToBoolUnchecked(check_mask), use_rov, needs_rov_depth,
   1093                   (depth_test != 0));
   1094 
   1095                 if (!(batch_fragment_shaders[depth_test][render_mode][transparency_mode][texture_mode][check_mask]
   1096                                             [dithering][interlacing] = g_gpu_device->CreateShader(
   1097                                               GPUShaderStage::Fragment, shadergen.GetLanguage(), fs, error)))
   1098                 {
   1099                   return false;
   1100                 }
   1101 
   1102                 progress.Increment();
   1103               }
   1104             }
   1105           }
   1106         }
   1107       }
   1108     }
   1109   }
   1110 
   1111   static constexpr GPUPipeline::VertexAttribute vertex_attributes[] = {
   1112     GPUPipeline::VertexAttribute::Make(0, GPUPipeline::VertexAttribute::Semantic::Position, 0,
   1113                                        GPUPipeline::VertexAttribute::Type::Float, 4, OFFSETOF(BatchVertex, x)),
   1114     GPUPipeline::VertexAttribute::Make(1, GPUPipeline::VertexAttribute::Semantic::Color, 0,
   1115                                        GPUPipeline::VertexAttribute::Type::UNorm8, 4, OFFSETOF(BatchVertex, color)),
   1116     GPUPipeline::VertexAttribute::Make(2, GPUPipeline::VertexAttribute::Semantic::TexCoord, 0,
   1117                                        GPUPipeline::VertexAttribute::Type::UInt32, 1, OFFSETOF(BatchVertex, u)),
   1118     GPUPipeline::VertexAttribute::Make(3, GPUPipeline::VertexAttribute::Semantic::TexCoord, 1,
   1119                                        GPUPipeline::VertexAttribute::Type::UInt32, 1, OFFSETOF(BatchVertex, texpage)),
   1120     GPUPipeline::VertexAttribute::Make(4, GPUPipeline::VertexAttribute::Semantic::TexCoord, 2,
   1121                                        GPUPipeline::VertexAttribute::Type::UNorm8, 4, OFFSETOF(BatchVertex, uv_limits)),
   1122   };
   1123   static constexpr u32 NUM_BATCH_VERTEX_ATTRIBUTES = 2;
   1124   static constexpr u32 NUM_BATCH_TEXTURED_VERTEX_ATTRIBUTES = 4;
   1125   static constexpr u32 NUM_BATCH_TEXTURED_LIMITS_VERTEX_ATTRIBUTES = 5;
   1126 
   1127   GPUPipeline::GraphicsConfig plconfig = {};
   1128   plconfig.layout = GPUPipeline::Layout::SingleTextureAndUBO;
   1129   plconfig.input_layout.vertex_stride = sizeof(BatchVertex);
   1130   plconfig.rasterization = GPUPipeline::RasterizationState::GetNoCullState();
   1131   plconfig.primitive = GPUPipeline::Primitive::Triangles;
   1132   plconfig.geometry_shader = nullptr;
   1133   plconfig.samples = m_multisamples;
   1134   plconfig.per_sample_shading = per_sample_shading;
   1135   plconfig.depth = GPUPipeline::DepthState::GetNoTestsState();
   1136 
   1137   // [depth_test][transparency_mode][render_mode][texture_mode][dithering][interlacing][check_mask]
   1138   for (u8 depth_test = 0; depth_test < 2; depth_test++)
   1139   {
   1140     if (depth_test && !m_pgxp_depth_buffer)
   1141     {
   1142       // Not used.
   1143       continue;
   1144     }
   1145 
   1146     for (u8 transparency_mode = 0; transparency_mode < 5; transparency_mode++)
   1147     {
   1148       for (u8 render_mode = 0; render_mode < 5; render_mode++)
   1149       {
   1150         if (
   1151           // Can't generate shader blending.
   1152           (render_mode == static_cast<u8>(BatchRenderMode::ShaderBlend) && !m_allow_shader_blend) ||
   1153           // Don't need multipass shaders.
   1154           ((m_supports_framebuffer_fetch || m_prefer_shader_blend) &&
   1155            (render_mode == static_cast<u8>(BatchRenderMode::OnlyOpaque) ||
   1156             render_mode == static_cast<u8>(BatchRenderMode::OnlyTransparent))) ||
   1157           // If using ROV depth, we only draw with shader blending.
   1158           (needs_rov_depth && render_mode != static_cast<u8>(BatchRenderMode::ShaderBlend)))
   1159         {
   1160           progress.Increment(9 * 2 * 2 * 2);
   1161           continue;
   1162         }
   1163 
   1164         for (u8 texture_mode = 0; texture_mode < active_texture_modes; texture_mode++)
   1165         {
   1166           for (u8 dithering = 0; dithering < 2; dithering++)
   1167           {
   1168             for (u8 interlacing = 0; interlacing < 2; interlacing++)
   1169             {
   1170               for (u8 check_mask = 0; check_mask < 2; check_mask++)
   1171               {
   1172                 const bool textured = (static_cast<BatchTextureMode>(texture_mode) != BatchTextureMode::Disabled);
   1173                 const bool palette =
   1174                   (static_cast<BatchTextureMode>(texture_mode) == BatchTextureMode::Palette4Bit ||
   1175                    static_cast<BatchTextureMode>(texture_mode) == BatchTextureMode::Palette8Bit ||
   1176                    static_cast<BatchTextureMode>(texture_mode) == BatchTextureMode::SpritePalette4Bit ||
   1177                    static_cast<BatchTextureMode>(texture_mode) == BatchTextureMode::SpritePalette8Bit);
   1178                 const bool sprite = (static_cast<BatchTextureMode>(texture_mode) >= BatchTextureMode::SpriteStart);
   1179                 const bool uv_limits = ShouldClampUVs(sprite ? m_sprite_texture_filtering : m_texture_filtering);
   1180                 const bool use_shader_blending = (render_mode == static_cast<u8>(BatchRenderMode::ShaderBlend));
   1181                 const bool use_rov = (use_shader_blending && m_use_rov_for_shader_blend);
   1182                 plconfig.input_layout.vertex_attributes =
   1183                   textured ?
   1184                     (uv_limits ? std::span<const GPUPipeline::VertexAttribute>(
   1185                                    vertex_attributes, NUM_BATCH_TEXTURED_LIMITS_VERTEX_ATTRIBUTES) :
   1186                                  std::span<const GPUPipeline::VertexAttribute>(vertex_attributes,
   1187                                                                                NUM_BATCH_TEXTURED_VERTEX_ATTRIBUTES)) :
   1188                     std::span<const GPUPipeline::VertexAttribute>(vertex_attributes, NUM_BATCH_VERTEX_ATTRIBUTES);
   1189 
   1190                 plconfig.vertex_shader =
   1191                   batch_vertex_shaders[BoolToUInt8(textured)][BoolToUInt8(palette)][BoolToUInt8(sprite)].get();
   1192                 plconfig.fragment_shader =
   1193                   batch_fragment_shaders[BoolToUInt8(depth_test && needs_rov_depth)][render_mode]
   1194                                         [use_shader_blending ? transparency_mode :
   1195                                                                static_cast<u8>(GPUTransparencyMode::Disabled)]
   1196                                         [texture_mode][use_shader_blending ? check_mask : 0][dithering][interlacing]
   1197                                           .get();
   1198                 Assert(plconfig.vertex_shader && plconfig.fragment_shader);
   1199 
   1200                 if (needs_real_depth_buffer)
   1201                 {
   1202                   plconfig.depth.depth_test =
   1203                     m_pgxp_depth_buffer ?
   1204                       (depth_test ? GPUPipeline::DepthFunc::LessEqual : GPUPipeline::DepthFunc::Always) :
   1205                       (check_mask ? GPUPipeline::DepthFunc::GreaterEqual : GPUPipeline::DepthFunc::Always);
   1206 
   1207                   // Don't write for transparent, but still test.
   1208                   plconfig.depth.depth_write =
   1209                     !m_pgxp_depth_buffer ||
   1210                     (depth_test && transparency_mode == static_cast<u8>(GPUTransparencyMode::Disabled));
   1211                 }
   1212 
   1213                 plconfig.SetTargetFormats(use_rov ? GPUTexture::Format::Unknown : VRAM_RT_FORMAT,
   1214                                           needs_rov_depth ? GPUTexture::Format::Unknown : depth_buffer_format);
   1215                 plconfig.color_formats[1] = needs_rov_depth ? VRAM_DS_COLOR_FORMAT : GPUTexture::Format::Unknown;
   1216                 plconfig.render_pass_flags =
   1217                   use_rov ? GPUPipeline::BindRenderTargetsAsImages :
   1218                             (needs_feedback_loop ? GPUPipeline::ColorFeedbackLoop : GPUPipeline::NoRenderPassFlags);
   1219 
   1220                 plconfig.blend = GPUPipeline::BlendState::GetNoBlendingState();
   1221 
   1222                 if (use_rov)
   1223                 {
   1224                   plconfig.blend.write_mask = 0;
   1225                 }
   1226                 else if (!use_shader_blending &&
   1227                          ((static_cast<GPUTransparencyMode>(transparency_mode) != GPUTransparencyMode::Disabled &&
   1228                            (static_cast<BatchRenderMode>(render_mode) != BatchRenderMode::TransparencyDisabled &&
   1229                             static_cast<BatchRenderMode>(render_mode) != BatchRenderMode::OnlyOpaque)) ||
   1230                           (textured &&
   1231                            IsBlendedTextureFiltering(sprite ? m_sprite_texture_filtering : m_texture_filtering))))
   1232                 {
   1233                   plconfig.blend.enable = true;
   1234                   plconfig.blend.src_alpha_blend = GPUPipeline::BlendFunc::One;
   1235                   plconfig.blend.dst_alpha_blend = GPUPipeline::BlendFunc::Zero;
   1236                   plconfig.blend.alpha_blend_op = GPUPipeline::BlendOp::Add;
   1237 
   1238                   if (m_supports_dual_source_blend)
   1239                   {
   1240                     plconfig.blend.src_blend = GPUPipeline::BlendFunc::One;
   1241                     plconfig.blend.dst_blend = GPUPipeline::BlendFunc::SrcAlpha1;
   1242                     plconfig.blend.blend_op =
   1243                       (static_cast<GPUTransparencyMode>(transparency_mode) ==
   1244                          GPUTransparencyMode::BackgroundMinusForeground &&
   1245                        static_cast<BatchRenderMode>(render_mode) != BatchRenderMode::TransparencyDisabled &&
   1246                        static_cast<BatchRenderMode>(render_mode) != BatchRenderMode::OnlyOpaque) ?
   1247                         GPUPipeline::BlendOp::ReverseSubtract :
   1248                         GPUPipeline::BlendOp::Add;
   1249                   }
   1250                   else
   1251                   {
   1252                     // TODO: This isn't entirely accurate, 127.5 versus 128.
   1253                     // But if we use fbfetch on Mali, it doesn't matter.
   1254                     plconfig.blend.src_blend = GPUPipeline::BlendFunc::One;
   1255                     plconfig.blend.dst_blend = GPUPipeline::BlendFunc::One;
   1256                     if (static_cast<GPUTransparencyMode>(transparency_mode) ==
   1257                         GPUTransparencyMode::HalfBackgroundPlusHalfForeground)
   1258                     {
   1259                       plconfig.blend.dst_blend = GPUPipeline::BlendFunc::ConstantColor;
   1260                       plconfig.blend.dst_alpha_blend = GPUPipeline::BlendFunc::ConstantColor;
   1261                       plconfig.blend.constant = 0x00808080u;
   1262                     }
   1263 
   1264                     plconfig.blend.blend_op =
   1265                       (static_cast<GPUTransparencyMode>(transparency_mode) ==
   1266                          GPUTransparencyMode::BackgroundMinusForeground &&
   1267                        static_cast<BatchRenderMode>(render_mode) != BatchRenderMode::TransparencyDisabled &&
   1268                        static_cast<BatchRenderMode>(render_mode) != BatchRenderMode::OnlyOpaque) ?
   1269                         GPUPipeline::BlendOp::ReverseSubtract :
   1270                         GPUPipeline::BlendOp::Add;
   1271                   }
   1272                 }
   1273 
   1274                 if (!(m_batch_pipelines[depth_test][transparency_mode][render_mode][texture_mode][dithering]
   1275                                        [interlacing][check_mask] = g_gpu_device->CreatePipeline(plconfig, error)))
   1276                 {
   1277                   return false;
   1278                 }
   1279 
   1280                 progress.Increment();
   1281               }
   1282             }
   1283           }
   1284         }
   1285       }
   1286     }
   1287   }
   1288 
   1289   plconfig.SetTargetFormats(VRAM_RT_FORMAT, needs_rov_depth ? GPUTexture::Format::Unknown : depth_buffer_format);
   1290   plconfig.render_pass_flags = needs_feedback_loop ? GPUPipeline::ColorFeedbackLoop : GPUPipeline::NoRenderPassFlags;
   1291 
   1292   if (m_wireframe_mode != GPUWireframeMode::Disabled)
   1293   {
   1294     std::unique_ptr<GPUShader> gs = g_gpu_device->CreateShader(GPUShaderStage::Geometry, shadergen.GetLanguage(),
   1295                                                                shadergen.GenerateWireframeGeometryShader(), error);
   1296     std::unique_ptr<GPUShader> fs = g_gpu_device->CreateShader(GPUShaderStage::Fragment, shadergen.GetLanguage(),
   1297                                                                shadergen.GenerateWireframeFragmentShader(), error);
   1298     if (!gs || !fs)
   1299       return false;
   1300 
   1301     GL_OBJECT_NAME(gs, "Batch Wireframe Geometry Shader");
   1302     GL_OBJECT_NAME(fs, "Batch Wireframe Fragment Shader");
   1303 
   1304     plconfig.input_layout.vertex_attributes =
   1305       std::span<const GPUPipeline::VertexAttribute>(vertex_attributes, NUM_BATCH_VERTEX_ATTRIBUTES);
   1306     plconfig.blend = (m_wireframe_mode == GPUWireframeMode::OverlayWireframe) ?
   1307                        GPUPipeline::BlendState::GetAlphaBlendingState() :
   1308                        GPUPipeline::BlendState::GetNoBlendingState();
   1309     plconfig.blend.write_mask = 0x7;
   1310     plconfig.depth = GPUPipeline::DepthState::GetNoTestsState();
   1311     plconfig.vertex_shader = batch_vertex_shaders[0][0][0].get();
   1312     plconfig.geometry_shader = gs.get();
   1313     plconfig.fragment_shader = fs.get();
   1314 
   1315     if (!(m_wireframe_pipeline = g_gpu_device->CreatePipeline(plconfig, error)))
   1316       return false;
   1317 
   1318     GL_OBJECT_NAME(m_wireframe_pipeline, "Batch Wireframe Pipeline");
   1319 
   1320     plconfig.vertex_shader = nullptr;
   1321     plconfig.geometry_shader = nullptr;
   1322     plconfig.fragment_shader = nullptr;
   1323 
   1324     progress.Increment();
   1325   }
   1326 
   1327   batch_shader_guard.Run();
   1328 
   1329   // use a depth of 1, that way writes will reset the depth
   1330   std::unique_ptr<GPUShader> fullscreen_quad_vertex_shader = g_gpu_device->CreateShader(
   1331     GPUShaderStage::Vertex, shadergen.GetLanguage(), shadergen.GenerateScreenQuadVertexShader(1.0f), error);
   1332   if (!fullscreen_quad_vertex_shader)
   1333     return false;
   1334 
   1335   progress.Increment();
   1336 
   1337   // common state
   1338   plconfig.input_layout.vertex_attributes = {};
   1339   plconfig.input_layout.vertex_stride = 0;
   1340   plconfig.layout = GPUPipeline::Layout::SingleTextureAndPushConstants;
   1341   plconfig.per_sample_shading = false;
   1342   plconfig.blend = GPUPipeline::BlendState::GetNoBlendingState();
   1343   plconfig.vertex_shader = fullscreen_quad_vertex_shader.get();
   1344   plconfig.color_formats[1] = needs_rov_depth ? VRAM_DS_COLOR_FORMAT : GPUTexture::Format::Unknown;
   1345 
   1346   // VRAM fill
   1347   for (u8 wrapped = 0; wrapped < 2; wrapped++)
   1348   {
   1349     for (u8 interlaced = 0; interlaced < 2; interlaced++)
   1350     {
   1351       std::unique_ptr<GPUShader> fs = g_gpu_device->CreateShader(
   1352         GPUShaderStage::Fragment, shadergen.GetLanguage(),
   1353         shadergen.GenerateVRAMFillFragmentShader(ConvertToBoolUnchecked(wrapped), ConvertToBoolUnchecked(interlaced)),
   1354         error);
   1355       if (!fs)
   1356         return false;
   1357 
   1358       plconfig.fragment_shader = fs.get();
   1359       plconfig.depth = needs_real_depth_buffer ? GPUPipeline::DepthState::GetAlwaysWriteState() :
   1360                                                  GPUPipeline::DepthState::GetNoTestsState();
   1361 
   1362       if (!(m_vram_fill_pipelines[wrapped][interlaced] = g_gpu_device->CreatePipeline(plconfig, error)))
   1363         return false;
   1364 
   1365       progress.Increment();
   1366     }
   1367   }
   1368 
   1369   // VRAM copy
   1370   {
   1371     std::unique_ptr<GPUShader> fs = g_gpu_device->CreateShader(GPUShaderStage::Fragment, shadergen.GetLanguage(),
   1372                                                                shadergen.GenerateVRAMCopyFragmentShader(), error);
   1373     if (!fs)
   1374       return false;
   1375 
   1376     plconfig.fragment_shader = fs.get();
   1377     for (u8 depth_test = 0; depth_test < 2; depth_test++)
   1378     {
   1379       if (depth_test && !m_write_mask_as_depth)
   1380         continue;
   1381 
   1382       plconfig.depth.depth_write = needs_real_depth_buffer;
   1383       plconfig.depth.depth_test =
   1384         (depth_test != 0) ? GPUPipeline::DepthFunc::GreaterEqual : GPUPipeline::DepthFunc::Always;
   1385 
   1386       if (!(m_vram_copy_pipelines[depth_test] = g_gpu_device->CreatePipeline(plconfig), error))
   1387         return false;
   1388 
   1389       GL_OBJECT_NAME_FMT(m_vram_copy_pipelines[depth_test], "VRAM Write Pipeline, depth={}", depth_test);
   1390 
   1391       progress.Increment();
   1392     }
   1393   }
   1394 
   1395   // VRAM write
   1396   {
   1397     const bool use_buffer = features.supports_texture_buffers;
   1398     const bool use_ssbo = features.texture_buffers_emulated_with_ssbo;
   1399     std::unique_ptr<GPUShader> fs =
   1400       g_gpu_device->CreateShader(GPUShaderStage::Fragment, shadergen.GetLanguage(),
   1401                                  shadergen.GenerateVRAMWriteFragmentShader(use_buffer, use_ssbo), error);
   1402     if (!fs)
   1403       return false;
   1404 
   1405     plconfig.layout = use_buffer ? GPUPipeline::Layout::SingleTextureBufferAndPushConstants :
   1406                                    GPUPipeline::Layout::SingleTextureAndPushConstants;
   1407     plconfig.fragment_shader = fs.get();
   1408     for (u8 depth_test = 0; depth_test < 2; depth_test++)
   1409     {
   1410       if (depth_test && !m_write_mask_as_depth)
   1411         continue;
   1412 
   1413       plconfig.depth.depth_write = needs_real_depth_buffer;
   1414       plconfig.depth.depth_test =
   1415         (depth_test != 0) ? GPUPipeline::DepthFunc::GreaterEqual : GPUPipeline::DepthFunc::Always;
   1416 
   1417       if (!(m_vram_write_pipelines[depth_test] = g_gpu_device->CreatePipeline(plconfig, error)))
   1418         return false;
   1419 
   1420       GL_OBJECT_NAME_FMT(m_vram_write_pipelines[depth_test], "VRAM Write Pipeline, depth={}", depth_test);
   1421 
   1422       progress.Increment();
   1423     }
   1424   }
   1425 
   1426   plconfig.layout = GPUPipeline::Layout::SingleTextureAndPushConstants;
   1427 
   1428   // VRAM write replacement
   1429   {
   1430     std::unique_ptr<GPUShader> fs = g_gpu_device->CreateShader(GPUShaderStage::Fragment, shadergen.GetLanguage(),
   1431                                                                shadergen.GenerateCopyFragmentShader(), error);
   1432     if (!fs)
   1433       return false;
   1434 
   1435     plconfig.fragment_shader = fs.get();
   1436     plconfig.depth = GPUPipeline::DepthState::GetNoTestsState();
   1437     if (!(m_vram_write_replacement_pipeline = g_gpu_device->CreatePipeline(plconfig, error)))
   1438       return false;
   1439 
   1440     progress.Increment();
   1441   }
   1442 
   1443   // VRAM update depth
   1444   if (m_write_mask_as_depth)
   1445   {
   1446     std::unique_ptr<GPUShader> fs = g_gpu_device->CreateShader(
   1447       GPUShaderStage::Fragment, shadergen.GetLanguage(), shadergen.GenerateVRAMUpdateDepthFragmentShader(), error);
   1448     if (!fs)
   1449       return false;
   1450 
   1451     plconfig.fragment_shader = fs.get();
   1452     plconfig.SetTargetFormats(GPUTexture::Format::Unknown, depth_buffer_format);
   1453     plconfig.depth = GPUPipeline::DepthState::GetAlwaysWriteState();
   1454     plconfig.blend.write_mask = 0;
   1455 
   1456     if (!(m_vram_update_depth_pipeline = g_gpu_device->CreatePipeline(plconfig, error)))
   1457       return false;
   1458 
   1459     GL_OBJECT_NAME(m_vram_update_depth_pipeline, "VRAM Update Depth Pipeline");
   1460 
   1461     progress.Increment();
   1462   }
   1463 
   1464   plconfig.SetTargetFormats(VRAM_RT_FORMAT);
   1465   plconfig.render_pass_flags = GPUPipeline::NoRenderPassFlags;
   1466   plconfig.depth = GPUPipeline::DepthState::GetNoTestsState();
   1467   plconfig.blend = GPUPipeline::BlendState::GetNoBlendingState();
   1468   plconfig.samples = 1;
   1469   plconfig.per_sample_shading = false;
   1470 
   1471   // VRAM read
   1472   {
   1473     std::unique_ptr<GPUShader> fs = g_gpu_device->CreateShader(GPUShaderStage::Fragment, shadergen.GetLanguage(),
   1474                                                                shadergen.GenerateVRAMReadFragmentShader(), error);
   1475     if (!fs)
   1476       return false;
   1477 
   1478     plconfig.fragment_shader = fs.get();
   1479 
   1480     if (!(m_vram_readback_pipeline = g_gpu_device->CreatePipeline(plconfig, error)))
   1481       return false;
   1482 
   1483     GL_OBJECT_NAME(m_vram_readback_pipeline, "VRAM Read Pipeline");
   1484     progress.Increment();
   1485   }
   1486 
   1487   // Display
   1488   {
   1489     for (u8 shader = 0; shader < 3; shader++)
   1490     {
   1491       // 24-bit doesn't give you a depth buffer.
   1492       const bool color_24bit = (shader == 1);
   1493       const bool depth_extract = (shader == 2);
   1494       if (depth_extract && !m_pgxp_depth_buffer)
   1495         continue;
   1496 
   1497       std::unique_ptr<GPUShader> fs =
   1498         g_gpu_device->CreateShader(GPUShaderStage::Fragment, shadergen.GetLanguage(),
   1499                                    shadergen.GenerateVRAMExtractFragmentShader(color_24bit, depth_extract), error);
   1500       if (!fs)
   1501         return false;
   1502 
   1503       plconfig.fragment_shader = fs.get();
   1504 
   1505       plconfig.layout = depth_extract ? GPUPipeline::Layout::MultiTextureAndPushConstants :
   1506                                         GPUPipeline::Layout::SingleTextureAndPushConstants;
   1507       plconfig.color_formats[1] = depth_extract ? VRAM_DS_COLOR_FORMAT : GPUTexture::Format::Unknown;
   1508 
   1509       if (!(m_vram_extract_pipeline[shader] = g_gpu_device->CreatePipeline(plconfig, error)))
   1510         return false;
   1511 
   1512       progress.Increment();
   1513     }
   1514   }
   1515 
   1516   plconfig.layout = GPUPipeline::Layout::SingleTextureAndPushConstants;
   1517 
   1518   if (m_pgxp_depth_buffer)
   1519   {
   1520     std::unique_ptr<GPUShader> fs = g_gpu_device->CreateShader(GPUShaderStage::Fragment, shadergen.GetLanguage(),
   1521                                                                shadergen.GenerateCopyFragmentShader(), error);
   1522     if (!fs)
   1523       return false;
   1524 
   1525     plconfig.fragment_shader = fs.get();
   1526     plconfig.SetTargetFormats(VRAM_DS_COLOR_FORMAT);
   1527     if (!(m_copy_depth_pipeline = g_gpu_device->CreatePipeline(plconfig, error)))
   1528       return false;
   1529   }
   1530 
   1531   plconfig.SetTargetFormats(VRAM_RT_FORMAT);
   1532 
   1533   if (m_downsample_mode == GPUDownsampleMode::Adaptive)
   1534   {
   1535     std::unique_ptr<GPUShader> vs = g_gpu_device->CreateShader(
   1536       GPUShaderStage::Vertex, shadergen.GetLanguage(), shadergen.GenerateAdaptiveDownsampleVertexShader(), error);
   1537     std::unique_ptr<GPUShader> fs =
   1538       g_gpu_device->CreateShader(GPUShaderStage::Fragment, shadergen.GetLanguage(),
   1539                                  shadergen.GenerateAdaptiveDownsampleMipFragmentShader(true), error);
   1540     if (!vs || !fs)
   1541       return false;
   1542     GL_OBJECT_NAME(fs, "Downsample Vertex Shader");
   1543     GL_OBJECT_NAME(fs, "Downsample First Pass Fragment Shader");
   1544     plconfig.vertex_shader = vs.get();
   1545     plconfig.fragment_shader = fs.get();
   1546     if (!(m_downsample_first_pass_pipeline = g_gpu_device->CreatePipeline(plconfig, error)))
   1547       return false;
   1548     GL_OBJECT_NAME(m_downsample_first_pass_pipeline, "Downsample First Pass Pipeline");
   1549 
   1550     fs = g_gpu_device->CreateShader(GPUShaderStage::Fragment, shadergen.GetLanguage(),
   1551                                     shadergen.GenerateAdaptiveDownsampleMipFragmentShader(false), error);
   1552     if (!fs)
   1553       return false;
   1554     GL_OBJECT_NAME(fs, "Downsample Mid Pass Fragment Shader");
   1555     plconfig.fragment_shader = fs.get();
   1556     if (!(m_downsample_mid_pass_pipeline = g_gpu_device->CreatePipeline(plconfig, error)))
   1557       return false;
   1558     GL_OBJECT_NAME(m_downsample_mid_pass_pipeline, "Downsample Mid Pass Pipeline");
   1559 
   1560     fs = g_gpu_device->CreateShader(GPUShaderStage::Fragment, shadergen.GetLanguage(),
   1561                                     shadergen.GenerateAdaptiveDownsampleBlurFragmentShader(), error);
   1562     if (!fs)
   1563       return false;
   1564     GL_OBJECT_NAME(fs, "Downsample Blur Pass Fragment Shader");
   1565     plconfig.fragment_shader = fs.get();
   1566     plconfig.SetTargetFormats(GPUTexture::Format::R8);
   1567     if (!(m_downsample_blur_pass_pipeline = g_gpu_device->CreatePipeline(plconfig, error)))
   1568       return false;
   1569     GL_OBJECT_NAME(m_downsample_blur_pass_pipeline, "Downsample Blur Pass Pipeline");
   1570 
   1571     fs = g_gpu_device->CreateShader(GPUShaderStage::Fragment, shadergen.GetLanguage(),
   1572                                     shadergen.GenerateAdaptiveDownsampleCompositeFragmentShader(), error);
   1573     if (!fs)
   1574       return false;
   1575     GL_OBJECT_NAME(fs, "Downsample Composite Pass Fragment Shader");
   1576     plconfig.layout = GPUPipeline::Layout::MultiTextureAndPushConstants;
   1577     plconfig.fragment_shader = fs.get();
   1578     plconfig.SetTargetFormats(VRAM_RT_FORMAT);
   1579     if (!(m_downsample_composite_pass_pipeline = g_gpu_device->CreatePipeline(plconfig, error)))
   1580       return false;
   1581     GL_OBJECT_NAME(m_downsample_composite_pass_pipeline, "Downsample Blur Pass Pipeline");
   1582 
   1583     GPUSampler::Config config = GPUSampler::GetLinearConfig();
   1584     config.min_lod = 0;
   1585     config.max_lod = GPUSampler::Config::LOD_MAX;
   1586     if (!(m_downsample_lod_sampler = g_gpu_device->CreateSampler(config)))
   1587     {
   1588       Error::SetStringView(error, "Failed to create downsample LOD sampler.");
   1589       return false;
   1590     }
   1591     GL_OBJECT_NAME(m_downsample_lod_sampler, "Downsample LOD Sampler");
   1592     config.mip_filter = GPUSampler::Filter::Linear;
   1593     if (!(m_downsample_composite_sampler = g_gpu_device->CreateSampler(config)))
   1594     {
   1595       Error::SetStringView(error, "Failed to create downsample composite sampler.");
   1596       return false;
   1597     }
   1598     GL_OBJECT_NAME(m_downsample_composite_sampler, "Downsample Trilinear Sampler");
   1599     progress.Increment();
   1600   }
   1601   else if (m_downsample_mode == GPUDownsampleMode::Box)
   1602   {
   1603     std::unique_ptr<GPUShader> fs =
   1604       g_gpu_device->CreateShader(GPUShaderStage::Fragment, shadergen.GetLanguage(),
   1605                                  shadergen.GenerateBoxSampleDownsampleFragmentShader(
   1606                                    m_resolution_scale / GetBoxDownsampleScale(m_resolution_scale)),
   1607                                  error);
   1608     if (!fs)
   1609       return false;
   1610 
   1611     GL_OBJECT_NAME(fs, "Downsample First Pass Fragment Shader");
   1612     plconfig.fragment_shader = fs.get();
   1613 
   1614     if (!(m_downsample_first_pass_pipeline = g_gpu_device->CreatePipeline(plconfig, error)))
   1615       return false;
   1616 
   1617     GL_OBJECT_NAME(m_downsample_first_pass_pipeline, "Downsample First Pass Pipeline");
   1618     progress.Increment();
   1619   }
   1620 
   1621 #undef UPDATE_PROGRESS
   1622 
   1623   return true;
   1624 }
   1625 
   1626 void GPU_HW::DestroyPipelines()
   1627 {
   1628   static constexpr auto destroy = [](std::unique_ptr<GPUPipeline>& p) { p.reset(); };
   1629 
   1630   m_wireframe_pipeline.reset();
   1631 
   1632   m_batch_pipelines.enumerate(destroy);
   1633 
   1634   m_vram_fill_pipelines.enumerate(destroy);
   1635 
   1636   for (std::unique_ptr<GPUPipeline>& p : m_vram_write_pipelines)
   1637     destroy(p);
   1638 
   1639   for (std::unique_ptr<GPUPipeline>& p : m_vram_copy_pipelines)
   1640     destroy(p);
   1641 
   1642   for (std::unique_ptr<GPUPipeline>& p : m_vram_extract_pipeline)
   1643     destroy(p);
   1644 
   1645   destroy(m_vram_readback_pipeline);
   1646   destroy(m_vram_update_depth_pipeline);
   1647   destroy(m_vram_write_replacement_pipeline);
   1648 
   1649   destroy(m_downsample_first_pass_pipeline);
   1650   destroy(m_downsample_mid_pass_pipeline);
   1651   destroy(m_downsample_blur_pass_pipeline);
   1652   destroy(m_downsample_composite_pass_pipeline);
   1653   m_downsample_composite_sampler.reset();
   1654 
   1655   m_copy_depth_pipeline.reset();
   1656 }
   1657 
   1658 GPU_HW::BatchRenderMode GPU_HW::BatchConfig::GetRenderMode() const
   1659 {
   1660   return transparency_mode == GPUTransparencyMode::Disabled ? BatchRenderMode::TransparencyDisabled :
   1661                                                               BatchRenderMode::TransparentAndOpaque;
   1662 }
   1663 
// Refreshes the VRAM read (sampling) texture from the draw texture for the
// dirty regions selected by 'drawn' and/or 'written', then invalidates those
// dirty rectangles and clears the corresponding texpage-dirty flags.
void GPU_HW::UpdateVRAMReadTexture(bool drawn, bool written)
{
  GL_SCOPE("UpdateVRAMReadTexture()");

  // Copies/resolves one dirty rect into the read texture, clears its dirty
  // bit (dbit) and resets the rect to INVALID_RECT.
  const auto update = [this](GSVector4i& rect, u8 dbit) {
    if (m_texpage_dirty & dbit)
    {
      m_texpage_dirty &= ~dbit;
      if (!m_texpage_dirty)
        GL_INS_FMT("{} texpage is no longer dirty", (dbit & TEXPAGE_DIRTY_DRAWN_RECT) ? "DRAW" : "WRITE");
    }

    // Dirty rects are tracked in native VRAM coordinates; scale up to the
    // internal render resolution before copying.
    const GSVector4i scaled_rect = rect.mul32l(GSVector4i(m_resolution_scale));
    if (m_vram_texture->IsMultisampled())
    {
      // MSAA source must be resolved, not copied. Resolve only the dirty
      // region when the backend supports partial resolves, otherwise the
      // whole texture.
      if (g_gpu_device->GetFeatures().partial_msaa_resolve)
      {
        g_gpu_device->ResolveTextureRegion(m_vram_read_texture.get(), scaled_rect.left, scaled_rect.top, 0, 0,
                                           m_vram_texture.get(), scaled_rect.left, scaled_rect.top, scaled_rect.width(),
                                           scaled_rect.height());
      }
      else
      {
        g_gpu_device->ResolveTextureRegion(m_vram_read_texture.get(), 0, 0, 0, 0, m_vram_texture.get(), 0, 0,
                                           m_vram_texture->GetWidth(), m_vram_texture->GetHeight());
      }
    }
    else
    {
      // Non-MSAA: plain region copy of the scaled dirty rect.
      g_gpu_device->CopyTextureRegion(m_vram_read_texture.get(), scaled_rect.left, scaled_rect.top, 0, 0,
                                      m_vram_texture.get(), scaled_rect.left, scaled_rect.top, 0, 0,
                                      scaled_rect.width(), scaled_rect.height());
    }

    // m_counters.num_read_texture_updates++;
    rect = INVALID_RECT;
  };

  if (drawn)
  {
    DebugAssert(!m_vram_dirty_draw_rect.eq(INVALID_RECT));
    GL_INS_FMT("Updating draw rect {}", m_vram_dirty_draw_rect);

    u8 dbits = TEXPAGE_DIRTY_DRAWN_RECT;
    // If the write rect overlaps the draw rect, merge them into one update so
    // the region is only copied once; the write rect is then consumed here.
    if (written && m_vram_dirty_draw_rect.rintersects(m_vram_dirty_write_rect))
    {
      DebugAssert(!m_vram_dirty_write_rect.eq(INVALID_RECT));
      GL_INS_FMT("Including write rect {}", m_vram_dirty_write_rect);
      m_vram_dirty_draw_rect = m_vram_dirty_draw_rect.runion(m_vram_dirty_write_rect);
      m_vram_dirty_write_rect = INVALID_RECT;
      dbits = TEXPAGE_DIRTY_DRAWN_RECT | TEXPAGE_DIRTY_WRITTEN_RECT;
      written = false;
    }

    update(m_vram_dirty_draw_rect, dbits);
  }
  if (written)
  {
    GL_INS_FMT("Updating write rect {}", m_vram_dirty_write_rect);
    update(m_vram_dirty_write_rect, TEXPAGE_DIRTY_WRITTEN_RECT);
  }
}
   1726 
// Regenerates the depth buffer from the VRAM texture's mask bits by drawing a
// fullscreen pass with the update-depth pipeline, then restores the normal
// VRAM render-target/sampler/scissor state. Only valid when mask-as-depth is
// active and PGXP depth buffering is not.
void GPU_HW::UpdateDepthBufferFromMaskBit()
{
  DebugAssert(!m_pgxp_depth_buffer && m_vram_depth_texture && m_write_mask_as_depth);

  // Viewport should already be set full, only need to fudge the scissor.
  g_gpu_device->SetScissor(m_vram_texture->GetRect());
  g_gpu_device->InvalidateRenderTarget(m_vram_depth_texture.get());
  g_gpu_device->SetRenderTargets(nullptr, 0, m_vram_depth_texture.get());
  g_gpu_device->SetPipeline(m_vram_update_depth_pipeline.get());
  g_gpu_device->SetTextureSampler(0, m_vram_texture.get(), g_gpu_device->GetNearestSampler());
  // Fullscreen triangle (3 vertices, no vertex buffer needed).
  g_gpu_device->Draw(3, 0);

  // Restore.
  g_gpu_device->SetTextureSampler(0, m_vram_read_texture.get(), g_gpu_device->GetNearestSampler());
  SetVRAMRenderTarget();
  SetScissor();
}
   1744 
// Snapshots the current depth buffer (once per frame, and only when internal
// post-processing wants a depth input) before clearing it, so the previous
// frame's depth is still available at scanout time.
void GPU_HW::CopyAndClearDepthBuffer()
{
  if (!m_depth_was_copied)
  {
    // Take a copy of the current depth buffer so it can be used when the previous frame/buffer gets scanned out.
    // Don't bother when we're not postprocessing, it'd just be a wasted copy.
    if (PostProcessing::InternalChain.NeedsDepthBuffer())
    {
      // TODO: Shrink this to only the active area.
      GL_SCOPE("Copy Depth Buffer");

      m_vram_texture->MakeReadyForSampling();
      g_gpu_device->InvalidateRenderTarget(m_vram_depth_copy_texture.get());
      g_gpu_device->SetRenderTarget(m_vram_depth_copy_texture.get());
      g_gpu_device->SetViewportAndScissor(0, 0, m_vram_depth_texture->GetWidth(), m_vram_depth_texture->GetHeight());
      g_gpu_device->SetTextureSampler(0, m_vram_depth_texture.get(), g_gpu_device->GetNearestSampler());
      g_gpu_device->SetPipeline(m_copy_depth_pipeline.get());

      // Full-texture UV rect for the copy shader.
      const float uniforms[4] = {0.0f, 0.0f, 1.0f, 1.0f};
      g_gpu_device->PushUniformBuffer(uniforms, sizeof(uniforms));
      g_gpu_device->Draw(3, 0);
      // Copy trashed the bound state; put the batch rendering state back.
      RestoreDeviceContext();
    }

    // Marked even when no copy was performed, so the check is skipped for the
    // remainder of the frame.
    m_depth_was_copied = true;
  }

  ClearDepthBuffer();
}
   1774 
   1775 void GPU_HW::ClearDepthBuffer()
   1776 {
   1777   GL_SCOPE("GPU_HW::ClearDepthBuffer()");
   1778   DebugAssert(m_pgxp_depth_buffer);
   1779   if (m_use_rov_for_shader_blend)
   1780     g_gpu_device->ClearRenderTarget(m_vram_depth_texture.get(), 0xFF);
   1781   else
   1782     g_gpu_device->ClearDepth(m_vram_depth_texture.get(), 1.0f);
   1783   m_last_depth_z = 1.0f;
   1784 }
   1785 
   1786 void GPU_HW::SetScissor()
   1787 {
   1788   g_gpu_device->SetScissor(m_clamped_drawing_area.mul32l(GSVector4i(m_resolution_scale)));
   1789 }
   1790 
   1791 void GPU_HW::MapGPUBuffer(u32 required_vertices, u32 required_indices)
   1792 {
   1793   DebugAssert(!m_batch_vertex_ptr && !m_batch_index_ptr);
   1794 
   1795   void* vb_map;
   1796   u32 vb_space;
   1797   g_gpu_device->MapVertexBuffer(sizeof(BatchVertex), required_vertices, &vb_map, &vb_space, &m_batch_base_vertex);
   1798   m_batch_vertex_ptr = static_cast<BatchVertex*>(vb_map);
   1799   m_batch_vertex_space = Truncate16(std::min<u32>(vb_space, std::numeric_limits<u16>::max()));
   1800 
   1801   u32 ib_space;
   1802   g_gpu_device->MapIndexBuffer(required_indices, &m_batch_index_ptr, &ib_space, &m_batch_base_index);
   1803   m_batch_index_space = Truncate16(std::min<u32>(ib_space, std::numeric_limits<u16>::max()));
   1804 }
   1805 
   1806 void GPU_HW::UnmapGPUBuffer(u32 used_vertices, u32 used_indices)
   1807 {
   1808   DebugAssert(m_batch_vertex_ptr && m_batch_index_ptr);
   1809   g_gpu_device->UnmapVertexBuffer(sizeof(BatchVertex), used_vertices);
   1810   g_gpu_device->UnmapIndexBuffer(used_indices);
   1811   m_batch_vertex_ptr = nullptr;
   1812   m_batch_vertex_count = 0;
   1813   m_batch_vertex_space = 0;
   1814   m_batch_index_ptr = nullptr;
   1815   m_batch_index_count = 0;
   1816   m_batch_index_space = 0;
   1817 }
   1818 
   1819 ALWAYS_INLINE_RELEASE void GPU_HW::DrawBatchVertices(BatchRenderMode render_mode, u32 num_indices, u32 base_index,
   1820                                                      u32 base_vertex)
   1821 {
   1822   // [depth_test][transparency_mode][render_mode][texture_mode][dithering][interlacing][check_mask]
   1823   const u8 texture_mode = static_cast<u8>(m_batch.texture_mode) +
   1824                           ((m_batch.texture_mode != BatchTextureMode::Disabled && m_batch.sprite_mode) ?
   1825                              static_cast<u8>(BatchTextureMode::SpriteStart) :
   1826                              0);
   1827   const u8 depth_test = BoolToUInt8(m_batch.use_depth_buffer);
   1828   const u8 check_mask = BoolToUInt8(m_batch.check_mask_before_draw);
   1829   g_gpu_device->SetPipeline(m_batch_pipelines[depth_test][static_cast<u8>(m_batch.transparency_mode)][static_cast<u8>(
   1830     render_mode)][texture_mode][BoolToUInt8(m_batch.dithering)][BoolToUInt8(m_batch.interlacing)][check_mask]
   1831                               .get());
   1832 
   1833   GL_INS_FMT("Texture mode: {}", s_batch_texture_modes[texture_mode]);
   1834   GL_INS_FMT("Transparency mode: {}", s_transparency_modes[static_cast<u8>(m_batch.transparency_mode)]);
   1835   GL_INS_FMT("Render mode: {}", s_batch_render_modes[static_cast<u8>(render_mode)]);
   1836   GL_INS_FMT("Mask bit test: {}", m_batch.check_mask_before_draw);
   1837   GL_INS_FMT("Interlacing: {}", m_batch.check_mask_before_draw);
   1838 
   1839   // Activating ROV?
   1840   if (render_mode == BatchRenderMode::ShaderBlend)
   1841   {
   1842     if (m_use_rov_for_shader_blend)
   1843     {
   1844       if (!m_rov_active)
   1845       {
   1846         GL_INS("Activating ROV.");
   1847         m_rov_active = true;
   1848         SetVRAMRenderTarget();
   1849       }
   1850 
   1851       g_gpu_device->DrawIndexed(num_indices, base_index, base_vertex);
   1852     }
   1853     else if (m_supports_framebuffer_fetch)
   1854     {
   1855       // No barriers needed for FBFetch.
   1856       g_gpu_device->DrawIndexed(num_indices, base_index, base_vertex);
   1857     }
   1858     else
   1859     {
   1860       // Barriers. Yucky.
   1861       g_gpu_device->DrawIndexedWithBarrier(num_indices, base_index, base_vertex, GPUDevice::DrawBarrier::Full);
   1862     }
   1863   }
   1864   else
   1865   {
   1866     g_gpu_device->DrawIndexed(num_indices, base_index, base_vertex);
   1867   }
   1868 }
   1869 
// Adjusts quad texture coordinates so nearest-neighbour sampling at upscaled resolutions
// picks the same texels the PSX rasterizer would, for X/Y-flipped 2D sprites. Mutates the
// four vertices in place and may toggle batch sprite mode.
ALWAYS_INLINE_RELEASE void GPU_HW::HandleFlippedQuadTextureCoordinates(BatchVertex* vertices)
{
  // Taken from beetle-psx gpu_polygon.cpp
  // For X/Y flipped 2D sprites, PSX games rely on a very specific rasterization behavior. If U or V is decreasing in X
  // or Y, and we use the provided U/V as is, we will sample the wrong texel as interpolation covers an entire pixel,
  // while PSX samples its interpolation essentially in the top-left corner and splats that interpolant across the
  // entire pixel. While we could emulate this reasonably well in native resolution by shifting our vertex coords by
  // 0.5, this breaks in upscaling scenarios, because we have several samples per native sample and we need NN rules to
  // hit the same UV every time. One approach here is to use interpolate at offset or similar tricks to generalize the
  // PSX interpolation patterns, but the problem is that vertices sharing an edge will no longer see the same UV (due to
  // different plane derivatives), we end up sampling outside the intended boundary and artifacts are inevitable, so the
  // only case where we can apply this fixup is for "sprites" or similar which should not share edges, which leads to
  // this unfortunate code below.

  // It might be faster to do more direct checking here, but the code below handles primitives in any order and
  // orientation, and is far more SIMD-friendly if needed.
  // Edge vectors of the first triangle (A->B, B->C, C->A).
  const float abx = vertices[1].x - vertices[0].x;
  const float aby = vertices[1].y - vertices[0].y;
  const float bcx = vertices[2].x - vertices[1].x;
  const float bcy = vertices[2].y - vertices[1].y;
  const float cax = vertices[0].x - vertices[2].x;
  const float cay = vertices[0].y - vertices[2].y;

  // Hack for Wild Arms 2: The player sprite is drawn one line at a time with a quad, but the bottom V coordinates
  // are set to a large distance from the top V coordinate. When upscaling, this means that the coordinate is
  // interpolated between these two values, resulting in out-of-bounds sampling. At native, it's fine, because at the
  // top of the primitive, no amount is added to the coordinates. So, in this case, just set all coordinates to the
  // same value, from the first vertex, ensuring no interpolation occurs. Gate it based on the Y distance being one
  // pixel, limiting the risk of false positives.
  if (m_line_detect_mode == GPULineDetectMode::Quads &&
      (std::max(vertices[0].y, std::max(vertices[1].y, std::max(vertices[2].y, vertices[3].y))) -
       std::min(vertices[0].y, std::min(vertices[1].y, std::min(vertices[2].y, vertices[3].y)))) == 1.0f) [[unlikely]]
  {
    GL_INS_FMT("HLineQuad detected at [{},{}={},{} {},{}={},{} {},{}={},{} {},{}={},{}", vertices[0].x, vertices[0].y,
               vertices[0].u, vertices[0].v, vertices[1].x, vertices[1].y, vertices[1].u, vertices[1].v, vertices[2].x,
               vertices[2].y, vertices[2].u, vertices[2].v, vertices[3].x, vertices[3].y, vertices[3].u, vertices[3].v);
    // Flatten V across the whole quad so no vertical interpolation can occur.
    vertices[1].v = vertices[0].v;
    vertices[2].v = vertices[0].v;
    vertices[3].v = vertices[0].v;
  }

  // Compute static derivatives, just assume W is uniform across the primitive and that the plane equation remains the
  // same across the quad. (which it is, there is no Z.. yet).
  const float dudx = -aby * static_cast<float>(vertices[2].u) - bcy * static_cast<float>(vertices[0].u) -
                     cay * static_cast<float>(vertices[1].u);
  const float dvdx = -aby * static_cast<float>(vertices[2].v) - bcy * static_cast<float>(vertices[0].v) -
                     cay * static_cast<float>(vertices[1].v);
  const float dudy = +abx * static_cast<float>(vertices[2].u) + bcx * static_cast<float>(vertices[0].u) +
                     cax * static_cast<float>(vertices[1].u);
  const float dvdy = +abx * static_cast<float>(vertices[2].v) + bcx * static_cast<float>(vertices[0].v) +
                     cax * static_cast<float>(vertices[1].v);
  // Signed (double) area of the triangle; also the denominator of the derivatives above.
  const float area = bcx * cay - bcy * cax;

  // Detect and reject any triangles with 0 size texture area
  const s32 texArea = (vertices[1].u - vertices[0].u) * (vertices[2].v - vertices[0].v) -
                      (vertices[2].u - vertices[0].u) * (vertices[1].v - vertices[0].v);

  // Shouldn't matter as degenerate primitives will be culled anyways.
  if (area == 0.0f || texArea == 0)
    return;

  // Use floats here as it'll be faster than integer divides.
  const float rcp_area = 1.0f / area;
  const float dudx_area = dudx * rcp_area;
  const float dudy_area = dudy * rcp_area;
  const float dvdx_area = dvdx * rcp_area;
  const float dvdy_area = dvdy * rcp_area;
  const bool neg_dudx = dudx_area < 0.0f;
  const bool neg_dudy = dudy_area < 0.0f;
  const bool neg_dvdx = dvdx_area < 0.0f;
  const bool neg_dvdy = dvdy_area < 0.0f;
  const bool zero_dudx = dudx_area == 0.0f;
  const bool zero_dudy = dudy_area == 0.0f;
  const bool zero_dvdx = dvdx_area == 0.0f;
  const bool zero_dvdy = dvdy_area == 0.0f;

  // If we have negative dU or dV in any direction, increment the U or V to work properly with nearest-neighbor in
  // this impl. If we don't have 1:1 pixel correspondence, this creates a slight "shift" in the sprite, but we
  // guarantee that we don't sample garbage at least. Overall, this is kinda hacky because there can be legitimate,
  // rare cases where 3D meshes hit this scenario, and a single texel offset can pop in, but this is way better than
  // having borked 2D overall.
  //
  // TODO: If perf becomes an issue, we can probably SIMD the 8 comparisons above,
  // create an 8-bit code, and use a LUT to get the offsets.
  // Case 1: U is decreasing in X, but no change in Y.
  // Case 2: U is decreasing in Y, but no change in X.
  // Case 3: V is decreasing in X, but no change in Y.
  // Case 4: V is decreasing in Y, but no change in X.
  if ((neg_dudx && zero_dudy) || (neg_dudy && zero_dudx))
  {
    vertices[0].u++;
    vertices[1].u++;
    vertices[2].u++;
    vertices[3].u++;
  }

  if ((neg_dvdx && zero_dvdy) || (neg_dvdy && zero_dvdx))
  {
    vertices[0].v++;
    vertices[1].v++;
    vertices[2].v++;
    vertices[3].v++;
  }

  // 2D polygons should have zero change in V on the X axis, and vice versa.
  if (m_allow_sprite_mode)
    SetBatchSpriteMode(zero_dudy && zero_dvdx);
}
   1978 
   1979 bool GPU_HW::IsPossibleSpritePolygon(const BatchVertex* vertices) const
   1980 {
   1981   const float abx = vertices[1].x - vertices[0].x;
   1982   const float aby = vertices[1].y - vertices[0].y;
   1983   const float bcx = vertices[2].x - vertices[1].x;
   1984   const float bcy = vertices[2].y - vertices[1].y;
   1985   const float cax = vertices[0].x - vertices[2].x;
   1986   const float cay = vertices[0].y - vertices[2].y;
   1987   const float dvdx = -aby * static_cast<float>(vertices[2].v) - bcy * static_cast<float>(vertices[0].v) -
   1988                      cay * static_cast<float>(vertices[1].v);
   1989   const float dudy = +abx * static_cast<float>(vertices[2].u) + bcx * static_cast<float>(vertices[0].u) +
   1990                      cax * static_cast<float>(vertices[1].u);
   1991   const float area = bcx * cay - bcy * cax;
   1992   const s32 texArea = (vertices[1].u - vertices[0].u) * (vertices[2].v - vertices[0].v) -
   1993                       (vertices[2].u - vertices[0].u) * (vertices[1].v - vertices[0].v);
   1994 
   1995   // Doesn't matter.
   1996   if (area == 0.0f || texArea == 0)
   1997     return m_batch.sprite_mode;
   1998 
   1999   const float rcp_area = 1.0f / area;
   2000   const bool zero_dudy = ((dudy * rcp_area) == 0.0f);
   2001   const bool zero_dvdx = ((dvdx * rcp_area) == 0.0f);
   2002   return (zero_dudy && zero_dvdx);
   2003 }
   2004 
// Detects triangles that games use to draw one-pixel-wide lines and expands them
// into a quad (4 vertices / 6 indices) so they survive upscaling. Returns true if
// the primitive was consumed (vertices/indices were emitted here), false if the
// caller should render the triangle normally.
ALWAYS_INLINE_RELEASE bool GPU_HW::ExpandLineTriangles(BatchVertex* vertices)
{
  // Line expansion inspired by beetle-psx.
  // vshort: the vertex on the 1px-wide short edge; vlong: the far end of the line.
  BatchVertex *vshort, *vlong;
  bool vertical, horizontal;

  if (m_line_detect_mode == GPULineDetectMode::BasicTriangles)
  {
    // Given a tall/one-pixel-wide triangle, determine which vertex is the corner with axis-aligned edges.
    // Basic mode requires two vertices to share identical UVs; try each rotation.
    BatchVertex* vcorner;
    if (vertices[0].u == vertices[1].u && vertices[0].v == vertices[1].v)
    {
      // A,B,C
      vcorner = &vertices[0];
      vshort = &vertices[1];
      vlong = &vertices[2];
    }
    else if (vertices[1].u == vertices[2].u && vertices[1].v == vertices[2].v)
    {
      // B,C,A
      vcorner = &vertices[1];
      vshort = &vertices[2];
      vlong = &vertices[0];
    }
    else if (vertices[2].u == vertices[0].u && vertices[2].v == vertices[0].v)
    {
      // C,A,B
      vcorner = &vertices[2];
      vshort = &vertices[0];
      vlong = &vertices[1];
    }
    else
    {
      return false;
    }

    // Determine line direction. Vertical lines will have a width of 1, horizontal lines a height of 1.
    vertical = ((vcorner->y == vshort->y) && (std::abs(vcorner->x - vshort->x) == 1.0f));
    horizontal = ((vcorner->x == vshort->x) && (std::abs(vcorner->y - vshort->y) == 1.0f));
    if (vertical)
    {
      // Line should be vertical. Make sure the triangle is actually a right angle.
      // After the swap, vcorner is always the vertex vertically aligned with vlong.
      if (vshort->x == vlong->x)
        std::swap(vshort, vcorner);
      else if (vcorner->x != vlong->x)
        return false;

      GL_INS_FMT("Vertical line from Y={} to {}", vcorner->y, vlong->y);
    }
    else if (horizontal)
    {
      // Line should be horizontal. Make sure the triangle is actually a right angle.
      // After the swap, vcorner is always the vertex horizontally aligned with vlong.
      if (vshort->y == vlong->y)
        std::swap(vshort, vcorner);
      else if (vcorner->y != vlong->y)
        return false;

      GL_INS_FMT("Horizontal line from X={} to {}", vcorner->x, vlong->x);
    }
    else
    {
      // Not a line-like triangle.
      return false;
    }

    // We could adjust the short texture coordinate to +1 from its original position, rather than leaving it the same.
    // However, since the texture is unlikely to be a higher resolution than the one-wide triangle, there would be no
    // benefit in doing so.
  }
  else
  {
    DebugAssert(m_line_detect_mode == GPULineDetectMode::AggressiveTriangles);

    // Find direction of line based on horizontal position.
    // Aggressive mode only requires two vertices to share an X coordinate; UVs may differ.
    BatchVertex *va, *vb, *vc;
    if (vertices[0].x == vertices[1].x)
    {
      va = &vertices[0];
      vb = &vertices[1];
      vc = &vertices[2];
    }
    else if (vertices[1].x == vertices[2].x)
    {
      va = &vertices[1];
      vb = &vertices[2];
      vc = &vertices[0];
    }
    else if (vertices[2].x == vertices[0].x)
    {
      va = &vertices[2];
      vb = &vertices[0];
      vc = &vertices[1];
    }
    else
    {
      return false;
    }

    // Determine line direction. Vertical lines will have a width of 1, horizontal lines a height of 1.
    vertical = (std::abs(va->x - vc->x) == 1.0f);
    horizontal = (std::abs(va->y - vb->y) == 1.0f);
    if (!vertical && !horizontal)
      return false;

    // Determine which vertex is the right angle, based on the vertical position.
    const BatchVertex* vcorner;
    if (va->y == vc->y)
      vcorner = va;
    else if (vb->y == vc->y)
      vcorner = vb;
    else
      return false;

    // Find short/long edge of the triangle.
    BatchVertex* vother = ((vcorner == va) ? vb : va);
    vshort = horizontal ? vother : vc;
    vlong = vertical ? vother : vc;

    // Dark Forces draws its gun sprite vertically, but rotated compared to the sprite data in VRAM.
    // Therefore the difference in V should be ignored.
    vshort->u = vcorner->u;
    vshort->v = vcorner->v;
  }

  // Need to write the 4th vertex.
  // The 4th vertex completes the rectangle: copy vlong, then take the short-edge
  // coordinate along the line's minor axis.
  DebugAssert(m_batch_vertex_space >= 1);
  BatchVertex* last = &(vertices[3] = *vlong);
  last->x = vertical ? vshort->x : vlong->x;
  last->y = horizontal ? vshort->y : vlong->y;

  // Generate indices.
  // First triangle uses the original three vertices; second uses the short/long
  // pair plus the synthesized 4th vertex. Pointer differences recover the indices.
  const u32 base_vertex = m_batch_vertex_count;
  DebugAssert(m_batch_index_space >= 6);
  *(m_batch_index_ptr++) = Truncate16(base_vertex);
  *(m_batch_index_ptr++) = Truncate16(base_vertex + 1);
  *(m_batch_index_ptr++) = Truncate16(base_vertex + 2);
  *(m_batch_index_ptr++) = Truncate16(base_vertex + (vshort - vertices));
  *(m_batch_index_ptr++) = Truncate16(base_vertex + (vlong - vertices));
  *(m_batch_index_ptr++) = Truncate16(base_vertex + 3);
  m_batch_index_count += 6;
  m_batch_index_space -= 6;

  // Upload vertices.
  DebugAssert(m_batch_vertex_space >= 4);
  std::memcpy(m_batch_vertex_ptr, vertices, sizeof(BatchVertex) * 4);
  m_batch_vertex_ptr += 4;
  m_batch_vertex_count += 4;
  m_batch_vertex_space -= 4;
  return true;
}
   2155 
// Computes the min/max U/V over a triangle or quad and stores the clamping range
// on every vertex, then checks the range against any dirty texture pages.
void GPU_HW::ComputePolygonUVLimits(BatchVertex* vertices, u32 num_vertices)
{
  DebugAssert(num_vertices == 3 || num_vertices == 4);

  // Each load grabs the packed 32 bits holding a vertex's u16 U and V pair.
  GSVector2i v0 = GSVector2i::load32(&vertices[0].u);
  GSVector2i v1 = GSVector2i::load32(&vertices[1].u);
  GSVector2i v2 = GSVector2i::load32(&vertices[2].u);
  GSVector2i v3;
  GSVector2i min = v0.min_u16(v1).min_u16(v2);
  GSVector2i max = v0.max_u16(v1).max_u16(v2);
  if (num_vertices == 4)
  {
    v3 = GSVector2i::load32(&vertices[3].u);
    min = min.min_u16(v3);
    max = max.max_u16(v3);
  }

  // Lane 0 holds U, lane 1 holds V.
  u32 min_u = min.extract16<0>();
  u32 min_v = min.extract16<1>();
  u32 max_u = max.extract16<0>();
  u32 max_v = max.extract16<1>();
  // Pull the upper bound in by one texel when the range is non-empty, so
  // clamping never samples past the intended edge of the rect.
  max_u = (min_u != max_u) ? (max_u - 1) : max_u;
  max_v = (min_v != max_v) ? (max_v - 1) : max_v;

  for (u32 i = 0; i < num_vertices; i++)
    vertices[i].SetUVLimits(min_u, max_u, min_v, max_v);

  // If a previous draw touched the active texture page, see whether this UV
  // range overlaps the modified area (may force a flush/update elsewhere).
  if (m_texpage_dirty != 0)
    CheckForTexPageOverlap(GSVector4i(min).upl32(GSVector4i(max)).u16to32());
}
   2186 
   2187 void GPU_HW::SetBatchDepthBuffer(bool enabled)
   2188 {
   2189   if (m_batch.use_depth_buffer == enabled)
   2190     return;
   2191 
   2192   if (m_batch_index_count > 0)
   2193   {
   2194     FlushRender();
   2195     EnsureVertexBufferSpaceForCurrentCommand();
   2196   }
   2197 
   2198   m_batch.use_depth_buffer = enabled;
   2199 }
   2200 
   2201 void GPU_HW::CheckForDepthClear(const BatchVertex* vertices, u32 num_vertices)
   2202 {
   2203   DebugAssert(num_vertices == 3 || num_vertices == 4);
   2204   float average_z;
   2205   if (num_vertices == 3)
   2206     average_z = std::min((vertices[0].w + vertices[1].w + vertices[2].w) / 3.0f, 1.0f);
   2207   else
   2208     average_z = std::min((vertices[0].w + vertices[1].w + vertices[2].w + vertices[3].w) / 4.0f, 1.0f);
   2209 
   2210   if ((average_z - m_last_depth_z) >= g_settings.gpu_pgxp_depth_clear_threshold)
   2211   {
   2212     FlushRender();
   2213     CopyAndClearDepthBuffer();
   2214     EnsureVertexBufferSpaceForCurrentCommand();
   2215   }
   2216 
   2217   m_last_depth_z = average_z;
   2218 }
   2219 
   2220 void GPU_HW::SetBatchSpriteMode(bool enabled)
   2221 {
   2222   if (m_batch.sprite_mode == enabled)
   2223     return;
   2224 
   2225   if (m_batch_index_count > 0)
   2226   {
   2227     FlushRender();
   2228     EnsureVertexBufferSpaceForCurrentCommand();
   2229   }
   2230 
   2231   GL_INS_FMT("Sprite mode is now {}", enabled ? "ON" : "OFF");
   2232 
   2233   m_batch.sprite_mode = enabled;
   2234 }
   2235 
   2236 void GPU_HW::DrawLine(const GSVector4 bounds, u32 col0, u32 col1, float depth)
   2237 {
   2238   DebugAssert(m_batch_vertex_space >= 4 && m_batch_index_space >= 6);
   2239 
   2240   const float x0 = bounds.x;
   2241   const float y0 = bounds.y;
   2242   const float x1 = bounds.z;
   2243   const float y1 = bounds.w;
   2244 
   2245   const float dx = x1 - x0;
   2246   const float dy = y1 - y0;
   2247   if (dx == 0.0f && dy == 0.0f)
   2248   {
   2249     // Degenerate, render a point.
   2250     (m_batch_vertex_ptr++)->Set(x0, y0, depth, 1.0f, col0, 0, 0, 0);
   2251     (m_batch_vertex_ptr++)->Set(x0 + 1.0f, y0, depth, 1.0f, col0, 0, 0, 0);
   2252     (m_batch_vertex_ptr++)->Set(x1, y1 + 1.0f, depth, 1.0f, col0, 0, 0, 0);
   2253     (m_batch_vertex_ptr++)->Set(x1 + 1.0f, y1 + 1.0f, depth, 1.0f, col0, 0, 0, 0);
   2254   }
   2255   else
   2256   {
   2257     const float abs_dx = std::fabs(dx);
   2258     const float abs_dy = std::fabs(dy);
   2259     float fill_dx, fill_dy;
   2260     float pad_x0 = 0.0f;
   2261     float pad_x1 = 0.0f;
   2262     float pad_y0 = 0.0f;
   2263     float pad_y1 = 0.0f;
   2264 
   2265     // Check for vertical or horizontal major lines.
   2266     // When expanding to a rect, do so in the appropriate direction.
   2267     // FIXME: This scheme seems to kinda work, but it seems very hard to find a method
   2268     // that looks perfect on every game.
   2269     // Vagrant Story speech bubbles are a very good test case here!
   2270     if (abs_dx > abs_dy)
   2271     {
   2272       fill_dx = 0.0f;
   2273       fill_dy = 1.0f;
   2274       const float dydk = dy / abs_dx;
   2275 
   2276       if (dx > 0.0f)
   2277       {
   2278         // Right
   2279         pad_x1 = 1.0f;
   2280         pad_y1 = dydk;
   2281       }
   2282       else
   2283       {
   2284         // Left
   2285         pad_x0 = 1.0f;
   2286         pad_y0 = -dydk;
   2287       }
   2288     }
   2289     else
   2290     {
   2291       fill_dx = 1.0f;
   2292       fill_dy = 0.0f;
   2293       const float dxdk = dx / abs_dy;
   2294 
   2295       if (dy > 0.0f)
   2296       {
   2297         // Down
   2298         pad_y1 = 1.0f;
   2299         pad_x1 = dxdk;
   2300       }
   2301       else
   2302       {
   2303         // Up
   2304         pad_y0 = 1.0f;
   2305         pad_x0 = -dxdk;
   2306       }
   2307     }
   2308 
   2309     const float ox0 = x0 + pad_x0;
   2310     const float oy0 = y0 + pad_y0;
   2311     const float ox1 = x1 + pad_x1;
   2312     const float oy1 = y1 + pad_y1;
   2313 
   2314     (m_batch_vertex_ptr++)->Set(ox0, oy0, depth, 1.0f, col0, 0, 0, 0);
   2315     (m_batch_vertex_ptr++)->Set(ox0 + fill_dx, oy0 + fill_dy, depth, 1.0f, col0, 0, 0, 0);
   2316     (m_batch_vertex_ptr++)->Set(ox1, oy1, depth, 1.0f, col1, 0, 0, 0);
   2317     (m_batch_vertex_ptr++)->Set(ox1 + fill_dx, oy1 + fill_dy, depth, 1.0f, col1, 0, 0, 0);
   2318   }
   2319 
   2320   const u32 start_index = m_batch_vertex_count;
   2321   m_batch_vertex_count += 4;
   2322   m_batch_vertex_space -= 4;
   2323 
   2324   *(m_batch_index_ptr++) = Truncate16(start_index + 0);
   2325   *(m_batch_index_ptr++) = Truncate16(start_index + 1);
   2326   *(m_batch_index_ptr++) = Truncate16(start_index + 2);
   2327   *(m_batch_index_ptr++) = Truncate16(start_index + 3);
   2328   *(m_batch_index_ptr++) = Truncate16(start_index + 2);
   2329   *(m_batch_index_ptr++) = Truncate16(start_index + 1);
   2330   m_batch_index_count += 6;
   2331   m_batch_index_space -= 6;
   2332 }
   2333 
   2334 void GPU_HW::LoadVertices()
   2335 {
   2336   if (m_GPUSTAT.check_mask_before_draw)
   2337     m_current_depth++;
   2338 
   2339   const GPURenderCommand rc{m_render_command.bits};
   2340   const u32 texpage = ZeroExtend32(m_draw_mode.mode_reg.bits) | (ZeroExtend32(m_draw_mode.palette_reg.bits) << 16);
   2341   const float depth = GetCurrentNormalizedVertexDepth();
   2342 
   2343   switch (rc.primitive)
   2344   {
   2345     case GPUPrimitive::Polygon:
   2346     {
   2347       const bool textured = rc.texture_enable;
   2348       const bool raw_texture = textured && rc.raw_texture_enable;
   2349       const bool shaded = rc.shading_enable;
   2350       const bool pgxp = g_settings.gpu_pgxp_enable;
   2351 
   2352       const u32 first_color = rc.color_for_first_vertex;
   2353       u32 num_vertices = rc.quad_polygon ? 4 : 3;
   2354       std::array<BatchVertex, 4> vertices;
   2355       std::array<GSVector2i, 4> native_vertex_positions;
   2356       std::array<u16, 4> native_texcoords;
   2357       bool valid_w = g_settings.gpu_pgxp_texture_correction;
   2358       for (u32 i = 0; i < num_vertices; i++)
   2359       {
   2360         const u32 vert_color = (shaded && i > 0) ? (FifoPop() & UINT32_C(0x00FFFFFF)) : first_color;
   2361         const u32 color = raw_texture ? UINT32_C(0x00808080) : vert_color;
   2362         const u64 maddr_and_pos = m_fifo.Pop();
   2363         const GPUVertexPosition vp{Truncate32(maddr_and_pos)};
   2364         const u16 texcoord = textured ? Truncate16(FifoPop()) : 0;
   2365         const s32 native_x = native_vertex_positions[i].x = m_drawing_offset.x + vp.x;
   2366         const s32 native_y = native_vertex_positions[i].y = m_drawing_offset.y + vp.y;
   2367         native_texcoords[i] = texcoord;
   2368         vertices[i].Set(static_cast<float>(native_x), static_cast<float>(native_y), depth, 1.0f, color, texpage,
   2369                         texcoord, 0xFFFF0000u);
   2370 
   2371         if (pgxp)
   2372         {
   2373           valid_w &= CPU::PGXP::GetPreciseVertex(Truncate32(maddr_and_pos >> 32), vp.bits, native_x, native_y,
   2374                                                  m_drawing_offset.x, m_drawing_offset.y, &vertices[i].x, &vertices[i].y,
   2375                                                  &vertices[i].w);
   2376         }
   2377       }
   2378       if (pgxp)
   2379       {
   2380         if (!valid_w)
   2381         {
   2382           SetBatchDepthBuffer(false);
   2383           if (g_settings.gpu_pgxp_disable_2d)
   2384           {
   2385             // NOTE: This reads uninitialized data, but it's okay, it doesn't get used.
   2386             for (size_t i = 0; i < vertices.size(); i++)
   2387             {
   2388               BatchVertex& v = vertices[i];
   2389               v.x = static_cast<float>(native_vertex_positions[i].x);
   2390               v.y = static_cast<float>(native_vertex_positions[i].y);
   2391               v.w = 1.0f;
   2392             }
   2393           }
   2394           else
   2395           {
   2396             for (BatchVertex& v : vertices)
   2397               v.w = 1.0f;
   2398           }
   2399         }
   2400         else if (m_pgxp_depth_buffer)
   2401         {
   2402           SetBatchDepthBuffer(true);
   2403           CheckForDepthClear(vertices.data(), num_vertices);
   2404         }
   2405       }
   2406 
   2407       // Use PGXP to exclude primitives that are definitely 3D.
   2408       const bool is_3d = (vertices[0].w != vertices[1].w || vertices[0].w != vertices[2].w);
   2409       if (m_resolution_scale > 1 && !is_3d && rc.quad_polygon)
   2410         HandleFlippedQuadTextureCoordinates(vertices.data());
   2411       else if (m_allow_sprite_mode)
   2412         SetBatchSpriteMode((pgxp && !is_3d) || IsPossibleSpritePolygon(vertices.data()));
   2413 
   2414       if (m_sw_renderer)
   2415       {
   2416         GPUBackendDrawPolygonCommand* cmd = m_sw_renderer->NewDrawPolygonCommand(num_vertices);
   2417         FillDrawCommand(cmd, rc);
   2418 
   2419         const u32 sw_num_vertices = rc.quad_polygon ? 4 : 3;
   2420         for (u32 i = 0; i < sw_num_vertices; i++)
   2421         {
   2422           GPUBackendDrawPolygonCommand::Vertex* vert = &cmd->vertices[i];
   2423           vert->x = native_vertex_positions[i].x;
   2424           vert->y = native_vertex_positions[i].y;
   2425           vert->texcoord = native_texcoords[i];
   2426           vert->color = vertices[i].color;
   2427         }
   2428 
   2429         m_sw_renderer->PushCommand(cmd);
   2430       }
   2431 
   2432       // Cull polygons which are too large.
   2433       const GSVector2 v0f = GSVector2::load(&vertices[0].x);
   2434       const GSVector2 v1f = GSVector2::load(&vertices[1].x);
   2435       const GSVector2 v2f = GSVector2::load(&vertices[2].x);
   2436       const GSVector2 min_pos_12 = v1f.min(v2f);
   2437       const GSVector2 max_pos_12 = v1f.max(v2f);
   2438       const GSVector4i draw_rect_012 = GSVector4i(GSVector4(min_pos_12.min(v0f)).upld(GSVector4(max_pos_12.max(v0f))))
   2439                                          .add32(GSVector4i::cxpr(0, 0, 1, 1));
   2440       const GSVector4i clamped_draw_rect_012 = draw_rect_012.rintersect(m_clamped_drawing_area);
   2441       const bool first_tri_culled = (draw_rect_012.width() > MAX_PRIMITIVE_WIDTH ||
   2442                                      draw_rect_012.height() > MAX_PRIMITIVE_HEIGHT || clamped_draw_rect_012.rempty());
   2443       if (first_tri_culled)
   2444       {
   2445         GL_INS_FMT("Culling off-screen/too-large polygon: {},{} {},{} {},{}", native_vertex_positions[0].x,
   2446                    native_vertex_positions[0].y, native_vertex_positions[1].x, native_vertex_positions[1].y,
   2447                    native_vertex_positions[2].x, native_vertex_positions[2].y);
   2448 
   2449         if (!rc.quad_polygon)
   2450           return;
   2451       }
   2452       else
   2453       {
   2454         if (textured && m_compute_uv_range)
   2455           ComputePolygonUVLimits(vertices.data(), num_vertices);
   2456 
   2457         AddDrawnRectangle(clamped_draw_rect_012);
   2458         AddDrawTriangleTicks(native_vertex_positions[0], native_vertex_positions[1], native_vertex_positions[2],
   2459                              rc.shading_enable, rc.texture_enable, rc.transparency_enable);
   2460 
   2461         // Expand lines to triangles (Doom, Soul Blade, etc.)
   2462         if (!rc.quad_polygon && m_line_detect_mode >= GPULineDetectMode::BasicTriangles && !is_3d &&
   2463             ExpandLineTriangles(vertices.data()))
   2464         {
   2465           return;
   2466         }
   2467 
   2468         const u32 start_index = m_batch_vertex_count;
   2469         DebugAssert(m_batch_index_space >= 3);
   2470         *(m_batch_index_ptr++) = Truncate16(start_index);
   2471         *(m_batch_index_ptr++) = Truncate16(start_index + 1);
   2472         *(m_batch_index_ptr++) = Truncate16(start_index + 2);
   2473         m_batch_index_count += 3;
   2474         m_batch_index_space -= 3;
   2475       }
   2476 
   2477       // quads
   2478       if (rc.quad_polygon)
   2479       {
   2480         const GSVector2 v3f = GSVector2::load(&vertices[3].x);
   2481         const GSVector4i draw_rect_123 = GSVector4i(GSVector4(min_pos_12.min(v3f)).upld(GSVector4(max_pos_12.max(v3f))))
   2482                                            .add32(GSVector4i::cxpr(0, 0, 1, 1));
   2483         const GSVector4i clamped_draw_rect_123 = draw_rect_123.rintersect(m_clamped_drawing_area);
   2484 
   2485         // Cull polygons which are too large.
   2486         const bool second_tri_culled =
   2487           (draw_rect_123.width() > MAX_PRIMITIVE_WIDTH || draw_rect_123.height() > MAX_PRIMITIVE_HEIGHT ||
   2488            clamped_draw_rect_123.rempty());
   2489         if (second_tri_culled)
   2490         {
   2491           GL_INS_FMT("Culling off-screen/too-large polygon (quad second half): {},{} {},{} {},{}",
   2492                      native_vertex_positions[2].x, native_vertex_positions[2].y, native_vertex_positions[1].x,
   2493                      native_vertex_positions[1].y, native_vertex_positions[0].x, native_vertex_positions[0].y);
   2494 
   2495           if (first_tri_culled)
   2496             return;
   2497         }
   2498         else
   2499         {
   2500           if (first_tri_culled && textured && m_compute_uv_range)
   2501             ComputePolygonUVLimits(vertices.data(), num_vertices);
   2502 
   2503           AddDrawnRectangle(clamped_draw_rect_123);
   2504           AddDrawTriangleTicks(native_vertex_positions[2], native_vertex_positions[1], native_vertex_positions[3],
   2505                                rc.shading_enable, rc.texture_enable, rc.transparency_enable);
   2506 
   2507           const u32 start_index = m_batch_vertex_count;
   2508           DebugAssert(m_batch_index_space >= 3);
   2509           *(m_batch_index_ptr++) = Truncate16(start_index + 2);
   2510           *(m_batch_index_ptr++) = Truncate16(start_index + 1);
   2511           *(m_batch_index_ptr++) = Truncate16(start_index + 3);
   2512           m_batch_index_count += 3;
   2513           m_batch_index_space -= 3;
   2514         }
   2515       }
   2516 
   2517       if (num_vertices == 4)
   2518       {
   2519         DebugAssert(m_batch_vertex_space >= 4);
   2520         std::memcpy(m_batch_vertex_ptr, vertices.data(), sizeof(BatchVertex) * 4);
   2521         m_batch_vertex_ptr += 4;
   2522         m_batch_vertex_count += 4;
   2523         m_batch_vertex_space -= 4;
   2524       }
   2525       else
   2526       {
   2527         DebugAssert(m_batch_vertex_space >= 3);
   2528         std::memcpy(m_batch_vertex_ptr, vertices.data(), sizeof(BatchVertex) * 3);
   2529         m_batch_vertex_ptr += 3;
   2530         m_batch_vertex_count += 3;
   2531         m_batch_vertex_space -= 3;
   2532       }
   2533     }
   2534     break;
   2535 
   2536     case GPUPrimitive::Rectangle:
   2537     {
   2538       const u32 color = (rc.texture_enable && rc.raw_texture_enable) ? UINT32_C(0x00808080) : rc.color_for_first_vertex;
   2539       const GPUVertexPosition vp{FifoPop()};
   2540       const s32 pos_x = TruncateGPUVertexPosition(m_drawing_offset.x + vp.x);
   2541       const s32 pos_y = TruncateGPUVertexPosition(m_drawing_offset.y + vp.y);
   2542 
   2543       const auto [texcoord_x, texcoord_y] = UnpackTexcoord(rc.texture_enable ? Truncate16(FifoPop()) : 0);
   2544       u32 orig_tex_left = ZeroExtend16(texcoord_x);
   2545       u32 orig_tex_top = ZeroExtend16(texcoord_y);
   2546       u32 rectangle_width;
   2547       u32 rectangle_height;
   2548       switch (rc.rectangle_size)
   2549       {
   2550         case GPUDrawRectangleSize::R1x1:
   2551           rectangle_width = 1;
   2552           rectangle_height = 1;
   2553           break;
   2554         case GPUDrawRectangleSize::R8x8:
   2555           rectangle_width = 8;
   2556           rectangle_height = 8;
   2557           break;
   2558         case GPUDrawRectangleSize::R16x16:
   2559           rectangle_width = 16;
   2560           rectangle_height = 16;
   2561           break;
   2562         default:
   2563         {
   2564           const u32 width_and_height = FifoPop();
   2565           rectangle_width = (width_and_height & VRAM_WIDTH_MASK);
   2566           rectangle_height = ((width_and_height >> 16) & VRAM_HEIGHT_MASK);
   2567         }
   2568         break;
   2569       }
   2570 
   2571       const GSVector4i rect =
   2572         GSVector4i(pos_x, pos_y, pos_x + static_cast<s32>(rectangle_width), pos_y + static_cast<s32>(rectangle_height));
   2573       const GSVector4i clamped_rect = m_clamped_drawing_area.rintersect(rect);
   2574       if (clamped_rect.rempty()) [[unlikely]]
   2575       {
   2576         GL_INS_FMT("Culling off-screen rectangle {}", rect);
   2577         return;
   2578       }
   2579 
   2580       // we can split the rectangle up into potentially 8 quads
   2581       SetBatchDepthBuffer(false);
   2582       SetBatchSpriteMode(m_allow_sprite_mode);
   2583       DebugAssert(m_batch_vertex_space >= MAX_VERTICES_FOR_RECTANGLE &&
   2584                   m_batch_index_space >= MAX_VERTICES_FOR_RECTANGLE);
   2585 
   2586       // Split the rectangle into multiple quads if it's greater than 256x256, as the texture page should repeat.
   2587       u32 tex_top = orig_tex_top;
   2588       for (u32 y_offset = 0; y_offset < rectangle_height;)
   2589       {
   2590         const s32 quad_height = std::min(rectangle_height - y_offset, TEXTURE_PAGE_WIDTH - tex_top);
   2591         const float quad_start_y = static_cast<float>(pos_y + static_cast<s32>(y_offset));
   2592         const float quad_end_y = quad_start_y + static_cast<float>(quad_height);
   2593         const u32 tex_bottom = tex_top + quad_height;
   2594 
   2595         u32 tex_left = orig_tex_left;
   2596         for (u32 x_offset = 0; x_offset < rectangle_width;)
   2597         {
   2598           const s32 quad_width = std::min(rectangle_width - x_offset, TEXTURE_PAGE_HEIGHT - tex_left);
   2599           const float quad_start_x = static_cast<float>(pos_x + static_cast<s32>(x_offset));
   2600           const float quad_end_x = quad_start_x + static_cast<float>(quad_width);
   2601           const u32 tex_right = tex_left + quad_width;
   2602           const u32 uv_limits = BatchVertex::PackUVLimits(tex_left, tex_right - 1, tex_top, tex_bottom - 1);
   2603 
   2604           if (rc.texture_enable && m_texpage_dirty != 0)
   2605           {
   2606             CheckForTexPageOverlap(GSVector4i(static_cast<s32>(tex_left), static_cast<s32>(tex_top),
   2607                                               static_cast<s32>(tex_right), static_cast<s32>(tex_bottom)));
   2608           }
   2609 
   2610           const u32 base_vertex = m_batch_vertex_count;
   2611           (m_batch_vertex_ptr++)
   2612             ->Set(quad_start_x, quad_start_y, depth, 1.0f, color, texpage, Truncate16(tex_left), Truncate16(tex_top),
   2613                   uv_limits);
   2614           (m_batch_vertex_ptr++)
   2615             ->Set(quad_end_x, quad_start_y, depth, 1.0f, color, texpage, Truncate16(tex_right), Truncate16(tex_top),
   2616                   uv_limits);
   2617           (m_batch_vertex_ptr++)
   2618             ->Set(quad_start_x, quad_end_y, depth, 1.0f, color, texpage, Truncate16(tex_left), Truncate16(tex_bottom),
   2619                   uv_limits);
   2620           (m_batch_vertex_ptr++)
   2621             ->Set(quad_end_x, quad_end_y, depth, 1.0f, color, texpage, Truncate16(tex_right), Truncate16(tex_bottom),
   2622                   uv_limits);
   2623           m_batch_vertex_count += 4;
   2624           m_batch_vertex_space -= 4;
   2625 
   2626           *(m_batch_index_ptr++) = Truncate16(base_vertex + 0);
   2627           *(m_batch_index_ptr++) = Truncate16(base_vertex + 1);
   2628           *(m_batch_index_ptr++) = Truncate16(base_vertex + 2);
   2629           *(m_batch_index_ptr++) = Truncate16(base_vertex + 2);
   2630           *(m_batch_index_ptr++) = Truncate16(base_vertex + 1);
   2631           *(m_batch_index_ptr++) = Truncate16(base_vertex + 3);
   2632           m_batch_index_count += 6;
   2633           m_batch_index_space -= 6;
   2634 
   2635           x_offset += quad_width;
   2636           tex_left = 0;
   2637         }
   2638 
   2639         y_offset += quad_height;
   2640         tex_top = 0;
   2641       }
   2642 
   2643       AddDrawnRectangle(clamped_rect);
   2644       AddDrawRectangleTicks(clamped_rect, rc.texture_enable, rc.transparency_enable);
   2645 
   2646       if (m_sw_renderer)
   2647       {
   2648         GPUBackendDrawRectangleCommand* cmd = m_sw_renderer->NewDrawRectangleCommand();
   2649         FillDrawCommand(cmd, rc);
   2650         cmd->color = color;
   2651         cmd->x = pos_x;
   2652         cmd->y = pos_y;
   2653         cmd->width = static_cast<u16>(rectangle_width);
   2654         cmd->height = static_cast<u16>(rectangle_height);
   2655         cmd->texcoord = (static_cast<u16>(texcoord_y) << 8) | static_cast<u16>(texcoord_x);
   2656         m_sw_renderer->PushCommand(cmd);
   2657       }
   2658     }
   2659     break;
   2660 
   2661     case GPUPrimitive::Line:
   2662     {
   2663       SetBatchDepthBuffer(false);
   2664 
   2665       if (!rc.polyline)
   2666       {
   2667         DebugAssert(m_batch_vertex_space >= 4 && m_batch_index_space >= 6);
   2668 
   2669         u32 start_color, end_color;
   2670         GPUVertexPosition start_pos, end_pos;
   2671         if (rc.shading_enable)
   2672         {
   2673           start_color = rc.color_for_first_vertex;
   2674           start_pos.bits = FifoPop();
   2675           end_color = FifoPop() & UINT32_C(0x00FFFFFF);
   2676           end_pos.bits = FifoPop();
   2677         }
   2678         else
   2679         {
   2680           start_color = end_color = rc.color_for_first_vertex;
   2681           start_pos.bits = FifoPop();
   2682           end_pos.bits = FifoPop();
   2683         }
   2684 
   2685         const GSVector4i vstart_pos = GSVector4i(start_pos.x + m_drawing_offset.x, start_pos.y + m_drawing_offset.y);
   2686         const GSVector4i vend_pos = GSVector4i(end_pos.x + m_drawing_offset.x, end_pos.y + m_drawing_offset.y);
   2687         const GSVector4i bounds = vstart_pos.xyxy(vend_pos);
   2688         const GSVector4i rect =
   2689           vstart_pos.min_i32(vend_pos).xyxy(vstart_pos.max_i32(vend_pos)).add32(GSVector4i::cxpr(0, 0, 1, 1));
   2690         const GSVector4i clamped_rect = rect.rintersect(m_clamped_drawing_area);
   2691 
   2692         if (rect.width() > MAX_PRIMITIVE_WIDTH || rect.height() > MAX_PRIMITIVE_HEIGHT || clamped_rect.rempty())
   2693         {
   2694           GL_INS_FMT("Culling too-large/off-screen line: {},{} - {},{}", bounds.x, bounds.y, bounds.z, bounds.w);
   2695           return;
   2696         }
   2697 
   2698         AddDrawnRectangle(clamped_rect);
   2699         AddDrawLineTicks(clamped_rect, rc.shading_enable);
   2700 
   2701         // TODO: Should we do a PGXP lookup here? Most lines are 2D.
   2702         DrawLine(GSVector4(bounds), start_color, end_color, depth);
   2703 
   2704         if (m_sw_renderer)
   2705         {
   2706           GPUBackendDrawLineCommand* cmd = m_sw_renderer->NewDrawLineCommand(2);
   2707           FillDrawCommand(cmd, rc);
   2708           GSVector4i::storel(&cmd->vertices[0], bounds);
   2709           cmd->vertices[0].color = start_color;
   2710           GSVector4i::storeh(&cmd->vertices[1], bounds);
   2711           cmd->vertices[1].color = end_color;
   2712           m_sw_renderer->PushCommand(cmd);
   2713         }
   2714       }
   2715       else
   2716       {
   2717         // Multiply by two because we don't use line strips.
   2718         const u32 num_vertices = GetPolyLineVertexCount();
   2719         DebugAssert(m_batch_vertex_space >= (num_vertices * 4) && m_batch_index_space >= (num_vertices * 6));
   2720 
   2721         const bool shaded = rc.shading_enable;
   2722 
   2723         u32 buffer_pos = 0;
   2724         const GPUVertexPosition start_vp{m_blit_buffer[buffer_pos++]};
   2725         GSVector4i start_pos = GSVector4i(start_vp.x + m_drawing_offset.x, start_vp.y + m_drawing_offset.y);
   2726         u32 start_color = rc.color_for_first_vertex;
   2727 
   2728         GPUBackendDrawLineCommand* cmd;
   2729         if (m_sw_renderer)
   2730         {
   2731           cmd = m_sw_renderer->NewDrawLineCommand(num_vertices);
   2732           FillDrawCommand(cmd, rc);
   2733           GSVector4i::storel(&cmd->vertices[0].x, start_pos);
   2734           cmd->vertices[0].color = start_color;
   2735         }
   2736         else
   2737         {
   2738           cmd = nullptr;
   2739         }
   2740 
   2741         for (u32 i = 1; i < num_vertices; i++)
   2742         {
   2743           const u32 end_color = shaded ? (m_blit_buffer[buffer_pos++] & UINT32_C(0x00FFFFFF)) : start_color;
   2744           const GPUVertexPosition vp{m_blit_buffer[buffer_pos++]};
   2745           const GSVector4i end_pos = GSVector4i(m_drawing_offset.x + vp.x, m_drawing_offset.y + vp.y);
   2746           const GSVector4i bounds = start_pos.xyxy(end_pos);
   2747           const GSVector4i rect =
   2748             start_pos.min_i32(end_pos).xyxy(start_pos.max_i32(end_pos)).add32(GSVector4i::cxpr(0, 0, 1, 1));
   2749           const GSVector4i clamped_rect = rect.rintersect(m_clamped_drawing_area);
   2750           if (rect.width() > MAX_PRIMITIVE_WIDTH || rect.height() > MAX_PRIMITIVE_HEIGHT || clamped_rect.rempty())
   2751           {
   2752             GL_INS_FMT("Culling too-large line: {},{} - {},{}", start_pos.x, start_pos.y, end_pos.x, end_pos.y);
   2753           }
   2754           else
   2755           {
   2756             AddDrawnRectangle(clamped_rect);
   2757             AddDrawLineTicks(clamped_rect, rc.shading_enable);
   2758 
   2759             // TODO: Should we do a PGXP lookup here? Most lines are 2D.
   2760             DrawLine(GSVector4(bounds), start_color, end_color, depth);
   2761           }
   2762 
   2763           start_pos = end_pos;
   2764           start_color = end_color;
   2765 
   2766           if (cmd)
   2767           {
   2768             GSVector4i::storel(&cmd->vertices[i], end_pos);
   2769             cmd->vertices[i].color = end_color;
   2770           }
   2771         }
   2772 
   2773         if (cmd)
   2774           m_sw_renderer->PushCommand(cmd);
   2775       }
   2776     }
   2777     break;
   2778 
   2779     default:
   2780       UnreachableCode();
   2781       break;
   2782   }
   2783 }
   2784 
   2785 bool GPU_HW::BlitVRAMReplacementTexture(const TextureReplacements::ReplacementImage* tex, u32 dst_x, u32 dst_y,
   2786                                         u32 width, u32 height)
   2787 {
   2788   if (!m_vram_replacement_texture || m_vram_replacement_texture->GetWidth() < tex->GetWidth() ||
   2789       m_vram_replacement_texture->GetHeight() < tex->GetHeight() || g_gpu_device->GetFeatures().prefer_unused_textures)
   2790   {
   2791     g_gpu_device->RecycleTexture(std::move(m_vram_replacement_texture));
   2792 
   2793     if (!(m_vram_replacement_texture =
   2794             g_gpu_device->FetchTexture(tex->GetWidth(), tex->GetHeight(), 1, 1, 1, GPUTexture::Type::DynamicTexture,
   2795                                        GPUTexture::Format::RGBA8, tex->GetPixels(), tex->GetPitch())))
   2796     {
   2797       return false;
   2798     }
   2799   }
   2800   else
   2801   {
   2802     if (!m_vram_replacement_texture->Update(0, 0, tex->GetWidth(), tex->GetHeight(), tex->GetPixels(), tex->GetPitch()))
   2803     {
   2804       ERROR_LOG("Update {}x{} texture failed.", width, height);
   2805       return false;
   2806     }
   2807   }
   2808 
   2809   GL_SCOPE_FMT("BlitVRAMReplacementTexture() {}x{} to {},{} => {},{} ({}x{})", tex->GetWidth(), tex->GetHeight(), dst_x,
   2810                dst_y, dst_x + width, dst_y + height, width, height);
   2811 
   2812   const float src_rect[4] = {
   2813     0.0f, 0.0f, static_cast<float>(tex->GetWidth()) / static_cast<float>(m_vram_replacement_texture->GetWidth()),
   2814     static_cast<float>(tex->GetHeight()) / static_cast<float>(m_vram_replacement_texture->GetHeight())};
   2815 
   2816   g_gpu_device->SetTextureSampler(0, m_vram_replacement_texture.get(), g_gpu_device->GetLinearSampler());
   2817   g_gpu_device->SetPipeline(m_vram_write_replacement_pipeline.get());
   2818   g_gpu_device->SetViewportAndScissor(dst_x, dst_y, width, height);
   2819   g_gpu_device->PushUniformBuffer(src_rect, sizeof(src_rect));
   2820   g_gpu_device->Draw(3, 0);
   2821 
   2822   RestoreDeviceContext();
   2823   return true;
   2824 }
   2825 
// Grows the UV rectangle tracked for the current batch by the given rect (in texel
// coordinates), and if the enlarged rect now overlaps a dirty (drawn or written)
// region of VRAM, flushes the pending batch and refreshes the VRAM read texture so
// subsequent texturing samples up-to-date data.
ALWAYS_INLINE_RELEASE void GPU_HW::CheckForTexPageOverlap(GSVector4i uv_rect)
{
  DebugAssert(m_texpage_dirty != 0 && m_batch.texture_mode != BatchTextureMode::Disabled);

  if (m_texture_window_active)
  {
    // Apply the texture window to both corners: AND with the mask, OR with the offset.
    const GSVector4i twin = GSVector4i::load<false>(m_batch_ubo_data.u_texture_window);
    uv_rect = ((uv_rect & twin.xyxy()) | twin.zwzw());

    // Min could be greater than max after applying window, correct for it.
    uv_rect = uv_rect.min_i32(uv_rect.zwzw()).max_i32(uv_rect.xyxy());
  }

  // Convert texel X to VRAM-pixel X: palette modes divide X by 4 (4-bit) or 2 (8-bit),
  // direct modes leave it unchanged.
  const GPUTextureMode tmode = m_draw_mode.mode_reg.texture_mode;
  const u32 xshift = (tmode >= GPUTextureMode::Direct16Bit) ? 0 : (2 - static_cast<u8>(tmode));
  const GSVector4i page_offset = GSVector4i::loadl(m_current_texture_page_offset).xyxy();

  uv_rect = uv_rect.blend32<5>(uv_rect.srl32(xshift));   // shift only goes on the x
  uv_rect = uv_rect.add32(page_offset);                  // page offset
  uv_rect = uv_rect.add32(GSVector4i::cxpr(0, 0, 1, 1)); // make exclusive
  uv_rect = uv_rect.rintersect(VRAM_SIZE_RECT);          // clamp to vram bounds

  const GSVector4i new_uv_rect = m_current_uv_rect.runion(uv_rect);

  // Only re-test the dirty intersection when the tracked rect actually grew.
  if (!m_current_uv_rect.eq(new_uv_rect))
  {
    m_current_uv_rect = new_uv_rect;

    bool update_drawn = false, update_written = false;
    if (m_texpage_dirty & TEXPAGE_DIRTY_DRAWN_RECT)
    {
      DebugAssert(!m_vram_dirty_draw_rect.eq(INVALID_RECT));
      update_drawn = m_current_uv_rect.rintersects(m_vram_dirty_draw_rect);
      if (update_drawn)
      {
        GL_INS_FMT("Updating VRAM cache due to UV {} intersection with dirty DRAW {}", m_current_uv_rect,
                   m_vram_dirty_draw_rect);
      }
    }
    if (m_texpage_dirty & TEXPAGE_DIRTY_WRITTEN_RECT)
    {
      DebugAssert(!m_vram_dirty_write_rect.eq(INVALID_RECT));
      update_written = m_current_uv_rect.rintersects(m_vram_dirty_write_rect);
      if (update_written)
      {
        GL_INS_FMT("Updating VRAM cache due to UV {} intersection with dirty WRITE {}", m_current_uv_rect,
                   m_vram_dirty_write_rect);
      }
    }

    if (update_drawn || update_written)
    {
      // Geometry already batched was meant to sample the old cache contents; draw it
      // out before refreshing the read texture.
      if (m_batch_index_count > 0)
      {
        FlushRender();
        EnsureVertexBufferSpaceForCurrentCommand();
      }

      UpdateVRAMReadTexture(update_drawn, update_written);
    }
  }
}
   2888 
// Returns true when no batched geometry is waiting to be drawn.
ALWAYS_INLINE bool GPU_HW::IsFlushed() const
{
  return (m_batch_index_count == 0);
}
   2893 
   2894 ALWAYS_INLINE_RELEASE bool GPU_HW::NeedsTwoPassRendering() const
   2895 {
   2896   // We need two-pass rendering when using BG-FG blending and texturing, as the transparency can be enabled
   2897   // on a per-pixel basis, and the opaque pixels shouldn't be blended at all.
   2898 
   2899   return (m_batch.texture_mode != BatchTextureMode::Disabled &&
   2900           (m_batch.transparency_mode == GPUTransparencyMode::BackgroundMinusForeground ||
   2901            (!m_supports_dual_source_blend && m_batch.transparency_mode != GPUTransparencyMode::Disabled)));
   2902 }
   2903 
   2904 ALWAYS_INLINE_RELEASE bool GPU_HW::NeedsShaderBlending(GPUTransparencyMode transparency, BatchTextureMode texture_mode,
   2905                                                        bool check_mask) const
   2906 {
   2907   return (m_allow_shader_blend &&
   2908           ((check_mask && !m_write_mask_as_depth) ||
   2909            (transparency != GPUTransparencyMode::Disabled && m_prefer_shader_blend) ||
   2910            (transparency == GPUTransparencyMode::BackgroundMinusForeground) ||
   2911            (!m_supports_dual_source_blend && texture_mode != BatchTextureMode::Disabled &&
   2912             (transparency != GPUTransparencyMode::Disabled || IsBlendedTextureFiltering(m_texture_filtering) ||
   2913              IsBlendedTextureFiltering(m_sprite_texture_filtering)))));
   2914 }
   2915 
   2916 void GPU_HW::EnsureVertexBufferSpace(u32 required_vertices, u32 required_indices)
   2917 {
   2918   if (m_batch_vertex_ptr)
   2919   {
   2920     if (m_batch_vertex_space >= required_vertices && m_batch_index_space >= required_indices)
   2921       return;
   2922 
   2923     FlushRender();
   2924   }
   2925 
   2926   MapGPUBuffer(required_vertices, required_indices);
   2927 }
   2928 
   2929 void GPU_HW::EnsureVertexBufferSpaceForCurrentCommand()
   2930 {
   2931   u32 required_vertices;
   2932   u32 required_indices;
   2933   switch (m_render_command.primitive)
   2934   {
   2935     case GPUPrimitive::Polygon:
   2936       required_vertices = 4; // assume quad, in case of expansion
   2937       required_indices = 6;
   2938       break;
   2939     case GPUPrimitive::Rectangle:
   2940       required_vertices = MAX_VERTICES_FOR_RECTANGLE; // TODO: WRong
   2941       required_indices = MAX_VERTICES_FOR_RECTANGLE;
   2942       break;
   2943     case GPUPrimitive::Line:
   2944     {
   2945       // assume expansion
   2946       const u32 vert_count = m_render_command.polyline ? GetPolyLineVertexCount() : 2;
   2947       required_vertices = vert_count * 4;
   2948       required_indices = vert_count * 6;
   2949     }
   2950     break;
   2951 
   2952     default:
   2953       UnreachableCode();
   2954   }
   2955 
   2956   // can we fit these vertices in the current depth buffer range?
   2957   if ((m_current_depth + required_vertices) > MAX_BATCH_VERTEX_COUNTER_IDS)
   2958   {
   2959     FlushRender();
   2960     ResetBatchVertexDepth();
   2961     MapGPUBuffer(required_vertices, required_indices);
   2962     return;
   2963   }
   2964 
   2965   EnsureVertexBufferSpace(required_vertices, required_indices);
   2966 }
   2967 
// Restarts the per-vertex depth counter. When the mask bit is emulated through the
// depth buffer, the depth buffer is rebuilt from the mask bits first so the renumbered
// vertices don't compare against stale depth values.
void GPU_HW::ResetBatchVertexDepth()
{
  DEV_LOG("Resetting batch vertex depth");

  if (m_write_mask_as_depth)
    UpdateDepthBufferFromMaskBit();

  m_current_depth = 1;
}
   2977 
// Maps the 16-bit depth counter into (0, 1]; larger counter values yield smaller
// (nearer) normalized depths.
ALWAYS_INLINE float GPU_HW::GetCurrentNormalizedVertexDepth() const
{
  return 1.0f - (static_cast<float>(m_current_depth) / 65535.0f);
}
   2982 
   2983 void GPU_HW::UpdateSoftwareRenderer(bool copy_vram_from_hw)
   2984 {
   2985   const bool current_enabled = (m_sw_renderer != nullptr);
   2986   const bool new_enabled = g_settings.gpu_use_software_renderer_for_readbacks;
   2987   if (current_enabled == new_enabled)
   2988     return;
   2989 
   2990   if (!new_enabled)
   2991   {
   2992     if (m_sw_renderer)
   2993       m_sw_renderer->Shutdown();
   2994     m_sw_renderer.reset();
   2995     return;
   2996   }
   2997 
   2998   std::unique_ptr<GPU_SW_Backend> sw_renderer = std::make_unique<GPU_SW_Backend>();
   2999   if (!sw_renderer->Initialize(true))
   3000     return;
   3001 
   3002   // We need to fill in the SW renderer's VRAM with the current state for hot toggles.
   3003   if (copy_vram_from_hw)
   3004   {
   3005     FlushRender();
   3006     ReadVRAM(0, 0, VRAM_WIDTH, VRAM_HEIGHT);
   3007 
   3008     // Sync the drawing area and CLUT.
   3009     GPUBackendSetDrawingAreaCommand* clip_cmd = sw_renderer->NewSetDrawingAreaCommand();
   3010     clip_cmd->new_area = m_drawing_area;
   3011     sw_renderer->PushCommand(clip_cmd);
   3012 
   3013     if (IsCLUTValid())
   3014     {
   3015       GPUBackendUpdateCLUTCommand* clut_cmd = sw_renderer->NewUpdateCLUTCommand();
   3016       FillBackendCommandParameters(clut_cmd);
   3017       clut_cmd->reg.bits = static_cast<u16>(m_current_clut_reg_bits);
   3018       clut_cmd->clut_is_8bit = m_current_clut_is_8bit;
   3019       sw_renderer->PushCommand(clut_cmd);
   3020     }
   3021   }
   3022 
   3023   m_sw_renderer = std::move(sw_renderer);
   3024 }
   3025 
// Populates the common parameter bits of a software-backend command from the current
// GPUSTAT and CRTC state.
void GPU_HW::FillBackendCommandParameters(GPUBackendCommand* cmd) const
{
  cmd->params.bits = 0;
  cmd->params.check_mask_before_draw = m_GPUSTAT.check_mask_before_draw;
  cmd->params.set_mask_while_drawing = m_GPUSTAT.set_mask_while_drawing;
  cmd->params.active_line_lsb = m_crtc_state.active_line_lsb;
  cmd->params.interlaced_rendering = m_GPUSTAT.SkipDrawingToActiveField();
}
   3034 
// Fills in a software-backend draw command: the common backend parameters, plus the
// render command bits and the current draw mode/palette/texture window state.
void GPU_HW::FillDrawCommand(GPUBackendDrawCommand* cmd, GPURenderCommand rc) const
{
  FillBackendCommandParameters(cmd);
  cmd->rc.bits = rc.bits;
  cmd->draw_mode.bits = m_draw_mode.mode_reg.bits;
  cmd->palette.bits = m_draw_mode.palette_reg.bits;
  cmd->window = m_draw_mode.texture_window;
}
   3043 
// Fills a rectangle of VRAM with a solid colour using the fill pipeline, mirroring the
// operation to the software backend (if active) and marking the area as drawn.
void GPU_HW::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color)
{
  GL_SCOPE_FMT("FillVRAM({},{} => {},{} ({}x{}) with 0x{:08X}", x, y, x + width, y + height, width, height, color);
  DeactivateROV();

  // Keep the software backend's VRAM copy in sync.
  if (m_sw_renderer)
  {
    GPUBackendFillVRAMCommand* cmd = m_sw_renderer->NewFillVRAMCommand();
    FillBackendCommandParameters(cmd);
    cmd->x = static_cast<u16>(x);
    cmd->y = static_cast<u16>(y);
    cmd->width = static_cast<u16>(width);
    cmd->height = static_cast<u16>(height);
    cmd->color = color;
    m_sw_renderer->PushCommand(cmd);
  }

  GL_INS_FMT("Dirty draw area before: {}", m_vram_dirty_draw_rect);

  const GSVector4i bounds = GetVRAMTransferBounds(x, y, width, height);
  AddUnclampedDrawnRectangle(bounds);

  GL_INS_FMT("Dirty draw area after: {}", m_vram_dirty_draw_rect);

  // Fills that extend past the VRAM edge wrap around, which needs a different shader
  // variant; interlaced rendering likewise gets its own variant.
  const bool is_oversized = (((x + width) > VRAM_WIDTH || (y + height) > VRAM_HEIGHT));
  g_gpu_device->SetPipeline(
    m_vram_fill_pipelines[BoolToUInt8(is_oversized)][BoolToUInt8(IsInterlacedRenderingEnabled())].get());

  const GSVector4i scaled_bounds = bounds.mul32l(GSVector4i(m_resolution_scale));
  g_gpu_device->SetViewportAndScissor(scaled_bounds);

  // Uniform layout consumed by the fill shader; coordinates are wrapped to VRAM size
  // and pre-scaled by the resolution scale.
  struct VRAMFillUBOData
  {
    u32 u_dst_x;
    u32 u_dst_y;
    u32 u_end_x;
    u32 u_end_y;
    std::array<float, 4> u_fill_color;
    u32 u_interlaced_displayed_field;
  };
  VRAMFillUBOData uniforms;
  uniforms.u_dst_x = (x % VRAM_WIDTH) * m_resolution_scale;
  uniforms.u_dst_y = (y % VRAM_HEIGHT) * m_resolution_scale;
  uniforms.u_end_x = ((x + width) % VRAM_WIDTH) * m_resolution_scale;
  uniforms.u_end_y = ((y + height) % VRAM_HEIGHT) * m_resolution_scale;
  // drop precision unless true colour is enabled
  uniforms.u_fill_color =
    GPUDevice::RGBA8ToFloat(m_true_color ? color : VRAMRGBA5551ToRGBA8888(VRAMRGBA8888ToRGBA5551(color)));
  uniforms.u_interlaced_displayed_field = GetActiveLineLSB();
  g_gpu_device->PushUniformBuffer(&uniforms, sizeof(uniforms));
  g_gpu_device->Draw(3, 0);

  RestoreDeviceContext();
}
   3098 
// Reads a rectangle of VRAM back into the CPU-side shadow copy (g_vram). When the
// software backend handles readbacks, this only synchronizes with it; otherwise the
// scaled VRAM texture is encoded to 16-bit and downloaded.
void GPU_HW::ReadVRAM(u32 x, u32 y, u32 width, u32 height)
{
  GL_PUSH_FMT("ReadVRAM({},{} => {},{} ({}x{})", x, y, x + width, y + height, width, height);

  // The software backend maintains its own VRAM copy; just wait for it to catch up.
  if (m_sw_renderer)
  {
    m_sw_renderer->Sync(false);
    GL_POP();
    return;
  }

  // Get bounds with wrap-around handled.
  GSVector4i copy_rect = GetVRAMTransferBounds(x, y, width, height);

  // Has to be aligned to an even pixel for the download, due to 32-bit packing.
  if (copy_rect.left & 1)
    copy_rect.left--;
  if (copy_rect.right & 1)
    copy_rect.right++;

  DebugAssert((copy_rect.left % 2) == 0 && (copy_rect.width() % 2) == 0);
  // Two 16-bit VRAM pixels pack into each 32-bit readback texel, so the encoded
  // coordinates are halved on the X axis.
  const u32 encoded_left = copy_rect.left / 2;
  const u32 encoded_top = copy_rect.top;
  const u32 encoded_width = copy_rect.width() / 2;
  const u32 encoded_height = copy_rect.height();

  // Encode the 24-bit texture as 16-bit.
  const s32 uniforms[4] = {copy_rect.left, copy_rect.top, copy_rect.width(), copy_rect.height()};
  g_gpu_device->SetRenderTarget(m_vram_readback_texture.get());
  g_gpu_device->SetPipeline(m_vram_readback_pipeline.get());
  g_gpu_device->SetTextureSampler(0, m_vram_texture.get(), g_gpu_device->GetNearestSampler());
  g_gpu_device->SetViewportAndScissor(0, 0, encoded_width, encoded_height);
  g_gpu_device->PushUniformBuffer(uniforms, sizeof(uniforms));
  g_gpu_device->Draw(3, 0);
  m_vram_readback_texture->MakeReadyForSampling();
  GL_POP();

  // Stage the readback and copy it into our shadow buffer.
  if (m_vram_readback_download_texture->IsImported())
  {
    // Fast path, read directly.
    m_vram_readback_download_texture->CopyFromTexture(encoded_left, encoded_top, m_vram_readback_texture.get(), 0, 0,
                                                      encoded_width, encoded_height, 0, 0, false);
    m_vram_readback_download_texture->Flush();
  }
  else
  {
    // Copy to staging buffer, then to VRAM.
    m_vram_readback_download_texture->CopyFromTexture(0, 0, m_vram_readback_texture.get(), 0, 0, encoded_width,
                                                      encoded_height, 0, 0, true);
    m_vram_readback_download_texture->ReadTexels(0, 0, encoded_width, encoded_height,
                                                 &g_vram[copy_rect.top * VRAM_WIDTH + copy_rect.left],
                                                 VRAM_WIDTH * sizeof(u16));
  }

  RestoreDeviceContext();
}
   3156 
// Writes a rectangle of pixel data into VRAM (CPU->VRAM transfer). Mirrors the write
// to the software backend, marks the region dirty, and may substitute a high-res
// replacement texture blit when no mask check is required.
void GPU_HW::UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, bool set_mask, bool check_mask)
{
  GL_SCOPE_FMT("UpdateVRAM({},{} => {},{} ({}x{})", x, y, x + width, y + height, width, height);

  if (m_sw_renderer)
  {
    const u32 num_words = width * height;
    GPUBackendUpdateVRAMCommand* cmd = m_sw_renderer->NewUpdateVRAMCommand(num_words);
    FillBackendCommandParameters(cmd);
    // Transfers use the explicitly-passed mask flags rather than GPUSTAT's.
    cmd->params.set_mask_while_drawing = set_mask;
    cmd->params.check_mask_before_draw = check_mask;
    cmd->x = static_cast<u16>(x);
    cmd->y = static_cast<u16>(y);
    cmd->width = static_cast<u16>(width);
    cmd->height = static_cast<u16>(height);
    std::memcpy(cmd->data, data, sizeof(u16) * num_words);
    m_sw_renderer->PushCommand(cmd);
  }

  const GSVector4i bounds = GetVRAMTransferBounds(x, y, width, height);
  DebugAssert(bounds.right <= static_cast<s32>(VRAM_WIDTH) && bounds.bottom <= static_cast<s32>(VRAM_HEIGHT));
  AddWrittenRectangle(bounds);

  if (check_mask)
  {
    // set new vertex counter since we want this to take into consideration previous masked pixels
    m_current_depth++;
  }
  else
  {
    // No mask semantics involved: a replacement image (if one matches this upload) can
    // be blitted directly at the scaled resolution instead.
    const TextureReplacements::ReplacementImage* rtex = TextureReplacements::GetVRAMReplacement(width, height, data);
    if (rtex && BlitVRAMReplacementTexture(rtex, x * m_resolution_scale, y * m_resolution_scale,
                                           width * m_resolution_scale, height * m_resolution_scale))
    {
      return;
    }
  }

  UpdateVRAMOnGPU(x, y, width, height, data, sizeof(u16) * width, set_mask, check_mask, bounds);
}
   3197 
// Performs the GPU-side portion of a VRAM write: stages the pixel data (via a texture
// buffer when supported, otherwise a plain upload texture) and draws it into the VRAM
// render target with the appropriate write pipeline.
void GPU_HW::UpdateVRAMOnGPU(u32 x, u32 y, u32 width, u32 height, const void* data, u32 data_pitch, bool set_mask,
                             bool check_mask, const GSVector4i bounds)
{
  DeactivateROV();

  std::unique_ptr<GPUTexture> upload_texture;
  u32 map_index;

  if (!g_gpu_device->GetFeatures().supports_texture_buffers)
  {
    // Fallback path: stage the data through a regular R16U texture.
    map_index = 0;
    upload_texture = g_gpu_device->FetchTexture(width, height, 1, 1, 1, GPUTexture::Type::Texture,
                                                GPUTexture::Format::R16U, data, data_pitch);
    if (!upload_texture)
    {
      ERROR_LOG("Failed to get {}x{} upload texture. Things are gonna break.", width, height);
      return;
    }
  }
  else
  {
    // Tightly pack the rows into the texture buffer, remembering the base element for
    // the shader to index from.
    const u32 num_pixels = width * height;
    const u32 dst_pitch = width * sizeof(u16);
    void* map = m_vram_upload_buffer->Map(num_pixels);
    map_index = m_vram_upload_buffer->GetCurrentPosition();
    StringUtil::StrideMemCpy(map, dst_pitch, data, data_pitch, dst_pitch, height);
    m_vram_upload_buffer->Unmap(num_pixels);
  }

  // Uniform layout consumed by the VRAM write shader; coordinates wrap to VRAM size.
  struct VRAMWriteUBOData
  {
    u32 u_dst_x;
    u32 u_dst_y;
    u32 u_end_x;
    u32 u_end_y;
    u32 u_width;
    u32 u_height;
    u32 u_buffer_base_offset;
    u32 u_mask_or_bits;
    float u_depth_value;
  };
  const VRAMWriteUBOData uniforms = {
    (x % VRAM_WIDTH), (y % VRAM_HEIGHT), ((x + width) % VRAM_WIDTH),  ((y + height) % VRAM_HEIGHT),     width,
    height,           map_index,         (set_mask) ? 0x8000u : 0x00, GetCurrentNormalizedVertexDepth()};

  // the viewport should already be set to the full vram, so just adjust the scissor
  const GSVector4i scaled_bounds = bounds.mul32l(GSVector4i(m_resolution_scale));
  g_gpu_device->SetScissor(scaled_bounds.left, scaled_bounds.top, scaled_bounds.width(), scaled_bounds.height());
  g_gpu_device->SetPipeline(m_vram_write_pipelines[BoolToUInt8(check_mask && m_write_mask_as_depth)].get());
  g_gpu_device->PushUniformBuffer(&uniforms, sizeof(uniforms));
  if (upload_texture)
  {
    g_gpu_device->SetTextureSampler(0, upload_texture.get(), g_gpu_device->GetNearestSampler());
    g_gpu_device->Draw(3, 0);
    g_gpu_device->RecycleTexture(std::move(upload_texture));
  }
  else
  {
    g_gpu_device->SetTextureBuffer(0, m_vram_upload_buffer.get());
    g_gpu_device->Draw(3, 0);
  }

  RestoreDeviceContext();
}
   3262 
   3263 void GPU_HW::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height)
   3264 {
   3265   GL_SCOPE_FMT("CopyVRAM({}x{} @ {},{} => {},{}", width, height, src_x, src_y, dst_x, dst_y);
   3266 
   3267   if (m_sw_renderer)
   3268   {
   3269     GPUBackendCopyVRAMCommand* cmd = m_sw_renderer->NewCopyVRAMCommand();
   3270     FillBackendCommandParameters(cmd);
   3271     cmd->src_x = static_cast<u16>(src_x);
   3272     cmd->src_y = static_cast<u16>(src_y);
   3273     cmd->dst_x = static_cast<u16>(dst_x);
   3274     cmd->dst_y = static_cast<u16>(dst_y);
   3275     cmd->width = static_cast<u16>(width);
   3276     cmd->height = static_cast<u16>(height);
   3277     m_sw_renderer->PushCommand(cmd);
   3278   }
   3279 
   3280   // masking enabled, oversized, or overlapping
   3281   const bool use_shader =
   3282     (m_GPUSTAT.IsMaskingEnabled() || ((src_x % VRAM_WIDTH) + width) > VRAM_WIDTH ||
   3283      ((src_y % VRAM_HEIGHT) + height) > VRAM_HEIGHT || ((dst_x % VRAM_WIDTH) + width) > VRAM_WIDTH ||
   3284      ((dst_y % VRAM_HEIGHT) + height) > VRAM_HEIGHT);
   3285   const GSVector4i src_bounds = GetVRAMTransferBounds(src_x, src_y, width, height);
   3286   const GSVector4i dst_bounds = GetVRAMTransferBounds(dst_x, dst_y, width, height);
   3287   const bool intersect_with_draw = m_vram_dirty_draw_rect.rintersects(src_bounds);
   3288   const bool intersect_with_write = m_vram_dirty_write_rect.rintersects(src_bounds);
   3289 
   3290   if (use_shader || IsUsingMultisampling())
   3291   {
   3292     if (intersect_with_draw || intersect_with_write)
   3293       UpdateVRAMReadTexture(intersect_with_draw, intersect_with_write);
   3294     AddUnclampedDrawnRectangle(dst_bounds);
   3295 
   3296     DeactivateROV();
   3297 
   3298     struct VRAMCopyUBOData
   3299     {
   3300       u32 u_src_x;
   3301       u32 u_src_y;
   3302       u32 u_dst_x;
   3303       u32 u_dst_y;
   3304       u32 u_end_x;
   3305       u32 u_end_y;
   3306       u32 u_width;
   3307       u32 u_height;
   3308       u32 u_set_mask_bit;
   3309       float u_depth_value;
   3310     };
   3311     const VRAMCopyUBOData uniforms = {(src_x % VRAM_WIDTH) * m_resolution_scale,
   3312                                       (src_y % VRAM_HEIGHT) * m_resolution_scale,
   3313                                       (dst_x % VRAM_WIDTH) * m_resolution_scale,
   3314                                       (dst_y % VRAM_HEIGHT) * m_resolution_scale,
   3315                                       ((dst_x + width) % VRAM_WIDTH) * m_resolution_scale,
   3316                                       ((dst_y + height) % VRAM_HEIGHT) * m_resolution_scale,
   3317                                       width * m_resolution_scale,
   3318                                       height * m_resolution_scale,
   3319                                       m_GPUSTAT.set_mask_while_drawing ? 1u : 0u,
   3320                                       GetCurrentNormalizedVertexDepth()};
   3321 
   3322     // VRAM read texture should already be bound.
   3323     const GSVector4i dst_bounds_scaled = dst_bounds.mul32l(GSVector4i(m_resolution_scale));
   3324     g_gpu_device->SetViewportAndScissor(dst_bounds_scaled);
   3325     g_gpu_device->SetPipeline(
   3326       m_vram_copy_pipelines[BoolToUInt8(m_GPUSTAT.check_mask_before_draw && m_write_mask_as_depth)].get());
   3327     g_gpu_device->PushUniformBuffer(&uniforms, sizeof(uniforms));
   3328     g_gpu_device->Draw(3, 0);
   3329     RestoreDeviceContext();
   3330 
   3331     if (m_GPUSTAT.check_mask_before_draw && !m_pgxp_depth_buffer)
   3332       m_current_depth++;
   3333 
   3334     return;
   3335   }
   3336 
   3337   GPUTexture* src_tex = m_vram_texture.get();
   3338   const bool overlaps_with_self = src_bounds.rintersects(dst_bounds);
   3339   if (!g_gpu_device->GetFeatures().texture_copy_to_self || overlaps_with_self)
   3340   {
   3341     src_tex = m_vram_read_texture.get();
   3342     if (intersect_with_draw || intersect_with_write)
   3343       UpdateVRAMReadTexture(intersect_with_draw, intersect_with_write);
   3344   }
   3345 
   3346   if (intersect_with_draw)
   3347   {
   3348     AddUnclampedDrawnRectangle(dst_bounds);
   3349   }
   3350   else if (intersect_with_write)
   3351   {
   3352     AddWrittenRectangle(dst_bounds);
   3353   }
   3354   else
   3355   {
   3356     const bool use_write =
   3357       (!m_vram_dirty_write_rect.eq(INVALID_RECT) && !m_vram_dirty_draw_rect.eq(INVALID_RECT) &&
   3358        RectDistance(m_vram_dirty_write_rect, dst_bounds) < RectDistance(m_vram_dirty_draw_rect, dst_bounds));
   3359     if (use_write)
   3360       AddWrittenRectangle(dst_bounds);
   3361     else
   3362       AddUnclampedDrawnRectangle(dst_bounds);
   3363   }
   3364 
   3365   if (m_GPUSTAT.check_mask_before_draw)
   3366   {
   3367     // set new vertex counter since we want this to take into consideration previous masked pixels
   3368     m_current_depth++;
   3369   }
   3370 
   3371   g_gpu_device->CopyTextureRegion(m_vram_texture.get(), dst_x * m_resolution_scale, dst_y * m_resolution_scale, 0, 0,
   3372                                   src_tex, src_x * m_resolution_scale, src_y * m_resolution_scale, 0, 0,
   3373                                   width * m_resolution_scale, height * m_resolution_scale);
   3374   if (src_tex != m_vram_texture.get())
   3375     m_vram_read_texture->MakeReadyForSampling();
   3376 }
   3377 
// Entry point for a GP0 draw command: determines the batch configuration (texture
// mode, transparency, dithering), flushes the current batch when incompatible state
// changes, refreshes per-batch uniforms/scissor on a fresh batch, then queues the
// command's vertices via LoadVertices().
void GPU_HW::DispatchRenderCommand()
{
  const GPURenderCommand rc{m_render_command.bits};

  BatchTextureMode texture_mode = BatchTextureMode::Disabled;
  if (rc.IsTexturingEnabled())
  {
    // texture page changed - check that the new page doesn't intersect the drawing area
    if (m_draw_mode.IsTexturePageChanged())
    {
      m_draw_mode.ClearTexturePageChangedFlag();

#if 0
      if (!m_vram_dirty_draw_rect.eq(INVALID_RECT) || !m_vram_dirty_write_rect.eq(INVALID_RECT))
      {
        GL_INS_FMT("VRAM DIRTY: {} {}", m_vram_dirty_draw_rect, m_vram_dirty_write_rect);
        GL_INS_FMT("PAGE RECT: {}", m_draw_mode.mode_reg.GetTexturePageRectangle());
        if (m_draw_mode.mode_reg.IsUsingPalette())
          GL_INS_FMT("PALETTE RECT: {}", m_draw_mode.palette_reg.GetRectangle(m_draw_mode.mode_reg.texture_mode));
      }
#endif

      // If the palette lies in a dirty VRAM area, the read texture must be refreshed
      // before any texel is fetched through it.
      if (m_draw_mode.mode_reg.IsUsingPalette())
      {
        const GSVector4i palette_rect = m_draw_mode.palette_reg.GetRectangle(m_draw_mode.mode_reg.texture_mode);
        const bool update_drawn = palette_rect.rintersects(m_vram_dirty_draw_rect);
        const bool update_written = palette_rect.rintersects(m_vram_dirty_write_rect);
        if (update_drawn || update_written)
        {
          GL_INS("Palette in VRAM dirty area, flushing cache");
          if (!IsFlushed())
            FlushRender();

          UpdateVRAMReadTexture(update_drawn, update_written);
        }
      }

      // Cache the page origin; storel() writes the rect's x/y pair.
      const GSVector4i page_rect = m_draw_mode.mode_reg.GetTexturePageRectangle();
      GSVector4i::storel(m_current_texture_page_offset, page_rect);

      // Defer the texpage refresh decision: record which dirty rects overlap the page,
      // and let per-primitive UV ranges decide whether an update is actually needed.
      u8 new_texpage_dirty = m_vram_dirty_draw_rect.rintersects(page_rect) ? TEXPAGE_DIRTY_DRAWN_RECT : 0;
      new_texpage_dirty |= m_vram_dirty_write_rect.rintersects(page_rect) ? TEXPAGE_DIRTY_WRITTEN_RECT : 0;

      if (new_texpage_dirty != 0)
      {
        GL_INS("Texpage is in dirty area, checking UV ranges");
        m_texpage_dirty = new_texpage_dirty;
        m_compute_uv_range = true;
        m_current_uv_rect = INVALID_RECT;
      }
      else
      {
        m_compute_uv_range = m_clamp_uvs;
        if (m_texpage_dirty)
          GL_INS("Texpage is no longer dirty");
        m_texpage_dirty = 0;
      }
    }

    texture_mode = (m_draw_mode.mode_reg.texture_mode == GPUTextureMode::Reserved_Direct16Bit) ?
                     BatchTextureMode::Direct16Bit :
                     static_cast<BatchTextureMode>(m_draw_mode.mode_reg.texture_mode.GetValue());
  }

  // has any state changed which requires a new batch?
  // Reverse blending breaks with mixed transparent and opaque pixels, so we have to do one draw per polygon.
  // If we have fbfetch, we don't need to draw it in two passes. Test case: Suikoden 2 shadows.
  const GPUTransparencyMode transparency_mode =
    rc.transparency_enable ? m_draw_mode.mode_reg.transparency_mode : GPUTransparencyMode::Disabled;
  const bool dithering_enable = (!m_true_color && rc.IsDitheringEnabled()) ? m_GPUSTAT.dither_enable : false;
  if (texture_mode != m_batch.texture_mode || transparency_mode != m_batch.transparency_mode ||
      (transparency_mode == GPUTransparencyMode::BackgroundMinusForeground && !m_allow_shader_blend) ||
      dithering_enable != m_batch.dithering)
  {
    FlushRender();
  }

  EnsureVertexBufferSpaceForCurrentCommand();

  // Batch is empty here either because it was just flushed or nothing was queued yet;
  // this is the only safe point to update per-batch uniforms and render state.
  if (m_batch_index_count == 0)
  {
    // transparency mode change
    const bool check_mask_before_draw = m_GPUSTAT.check_mask_before_draw;
    if (transparency_mode != GPUTransparencyMode::Disabled && !m_rov_active && !m_prefer_shader_blend &&
        !NeedsShaderBlending(transparency_mode, texture_mode, check_mask_before_draw))
    {
      // Per-mode src/dst alpha blend factors, indexed by GPUTransparencyMode.
      static constexpr float transparent_alpha[4][2] = {{0.5f, 0.5f}, {1.0f, 1.0f}, {1.0f, 1.0f}, {0.25f, 1.0f}};

      const float src_alpha_factor = transparent_alpha[static_cast<u32>(transparency_mode)][0];
      const float dst_alpha_factor = transparent_alpha[static_cast<u32>(transparency_mode)][1];
      m_batch_ubo_dirty |= (m_batch_ubo_data.u_src_alpha_factor != src_alpha_factor ||
                            m_batch_ubo_data.u_dst_alpha_factor != dst_alpha_factor);
      m_batch_ubo_data.u_src_alpha_factor = src_alpha_factor;
      m_batch_ubo_data.u_dst_alpha_factor = dst_alpha_factor;
    }

    const bool set_mask_while_drawing = m_GPUSTAT.set_mask_while_drawing;
    if (m_batch.check_mask_before_draw != check_mask_before_draw ||
        m_batch.set_mask_while_drawing != set_mask_while_drawing)
    {
      m_batch.check_mask_before_draw = check_mask_before_draw;
      m_batch.set_mask_while_drawing = set_mask_while_drawing;
      m_batch_ubo_dirty |= (m_batch_ubo_data.u_set_mask_while_drawing != BoolToUInt32(set_mask_while_drawing));
      m_batch_ubo_data.u_set_mask_while_drawing = BoolToUInt32(set_mask_while_drawing);
    }

    m_batch.interlacing = IsInterlacedRenderingEnabled();
    if (m_batch.interlacing)
    {
      const u32 displayed_field = GetActiveLineLSB();
      m_batch_ubo_dirty |= (m_batch_ubo_data.u_interlaced_displayed_field != displayed_field);
      m_batch_ubo_data.u_interlaced_displayed_field = displayed_field;
    }

    // update state
    m_batch.texture_mode = texture_mode;
    m_batch.transparency_mode = transparency_mode;
    m_batch.dithering = dithering_enable;

    if (m_draw_mode.IsTextureWindowChanged())
    {
      m_draw_mode.ClearTextureWindowChangedFlag();

      m_batch_ubo_data.u_texture_window[0] = ZeroExtend32(m_draw_mode.texture_window.and_x);
      m_batch_ubo_data.u_texture_window[1] = ZeroExtend32(m_draw_mode.texture_window.and_y);
      m_batch_ubo_data.u_texture_window[2] = ZeroExtend32(m_draw_mode.texture_window.or_x);
      m_batch_ubo_data.u_texture_window[3] = ZeroExtend32(m_draw_mode.texture_window.or_y);

      // A window is "active" unless it ANDs with all-ones and ORs with zero (identity).
      m_texture_window_active = ((m_draw_mode.texture_window.and_x & m_draw_mode.texture_window.and_y) != 0xFF ||
                                 ((m_draw_mode.texture_window.or_x | m_draw_mode.texture_window.or_y) != 0));
      m_batch_ubo_dirty = true;
    }

    if (m_drawing_area_changed)
    {
      m_drawing_area_changed = false;
      SetClampedDrawingArea();
      SetScissor();

      // With the PGXP depth buffer, a drawing-area change invalidates accumulated
      // depth, so copy-and-clear it before continuing.
      if (m_pgxp_depth_buffer && m_last_depth_z < 1.0f)
      {
        FlushRender();
        CopyAndClearDepthBuffer();
        EnsureVertexBufferSpaceForCurrentCommand();
      }

      if (m_sw_renderer)
      {
        GPUBackendSetDrawingAreaCommand* cmd = m_sw_renderer->NewSetDrawingAreaCommand();
        cmd->new_area = m_drawing_area;
        m_sw_renderer->PushCommand(cmd);
      }
    }
  }

  LoadVertices();
}
   3535 
   3536 void GPU_HW::UpdateCLUT(GPUTexturePaletteReg reg, bool clut_is_8bit)
   3537 {
   3538   // Not done in HW, but need to forward through to SW if using that for readbacks
   3539   if (m_sw_renderer)
   3540   {
   3541     GPUBackendUpdateCLUTCommand* cmd = m_sw_renderer->NewUpdateCLUTCommand();
   3542     FillBackendCommandParameters(cmd);
   3543     cmd->reg.bits = reg.bits;
   3544     cmd->clut_is_8bit = clut_is_8bit;
   3545     m_sw_renderer->PushCommand(cmd);
   3546   }
   3547 }
   3548 
// Submits the currently-accumulated vertex/index batch to the GPU, choosing between
// shader-blend, two-pass (opaque + transparent), and single-pass draw paths, plus an
// optional wireframe overlay. No-op when the batch is empty.
void GPU_HW::FlushRender()
{
  // Snapshot batch offsets before unmapping; UnmapGPUBuffer() finalizes the
  // mapped vertex/index allocations for this batch.
  const u32 base_vertex = m_batch_base_vertex;
  const u32 base_index = m_batch_base_index;
  const u32 index_count = m_batch_index_count;
  DebugAssert((m_batch_vertex_ptr != nullptr) == (m_batch_index_ptr != nullptr));
  if (m_batch_vertex_ptr)
    UnmapGPUBuffer(m_batch_vertex_count, index_count);
  if (index_count == 0)
    return;

#ifdef _DEBUG
  GL_SCOPE_FMT("Hardware Draw {}", ++s_draw_number);
#endif

  GL_INS_FMT("Dirty draw area: {}", m_vram_dirty_draw_rect);

  // Lazily upload batch uniforms only when something actually changed.
  if (m_batch_ubo_dirty)
  {
    g_gpu_device->UploadUniformBuffer(&m_batch_ubo_data, sizeof(m_batch_ubo_data));
    // m_counters.num_ubo_updates++;
    m_batch_ubo_dirty = false;
  }

  if (m_wireframe_mode != GPUWireframeMode::OnlyWireframe)
  {
    if (NeedsShaderBlending(m_batch.transparency_mode, m_batch.texture_mode, m_batch.check_mask_before_draw) ||
        m_rov_active || (m_use_rov_for_shader_blend && m_pgxp_depth_buffer))
    {
      // Blend in the shader (framebuffer fetch / ROV); one draw covers everything.
      DrawBatchVertices(BatchRenderMode::ShaderBlend, index_count, base_index, base_vertex);
    }
    else if (NeedsTwoPassRendering())
    {
      // Fixed-function blending with mixed opacity: opaque pixels first, then
      // transparent ones over the same geometry.
      DrawBatchVertices(BatchRenderMode::OnlyOpaque, index_count, base_index, base_vertex);
      DrawBatchVertices(BatchRenderMode::OnlyTransparent, index_count, base_index, base_vertex);
    }
    else
    {
      DrawBatchVertices(m_batch.GetRenderMode(), index_count, base_index, base_vertex);
    }
  }

  if (m_wireframe_mode != GPUWireframeMode::Disabled)
  {
    // This'll be less than ideal, but wireframe is for debugging, so take the perf hit.
    DeactivateROV();
    g_gpu_device->SetPipeline(m_wireframe_pipeline.get());
    g_gpu_device->DrawIndexed(index_count, base_index, base_vertex);
  }
}
   3599 
// Produces the texture that will be presented this frame: either the raw VRAM view
// (debug), a direct window into the VRAM texture (fast path), or an extracted copy
// (24bpp conversion, multisampling resolve, or postprocessing), followed by optional
// deinterlacing, chroma smoothing and downsampling.
void GPU_HW::UpdateDisplay()
{
  FlushRender();
  DeactivateROV();

  GL_SCOPE("UpdateDisplay()");

  // Debug option: display the whole VRAM instead of the CRTC-selected area.
  if (g_settings.debugging.show_vram)
  {
    if (IsUsingMultisampling())
    {
      UpdateVRAMReadTexture(true, true);
      SetDisplayTexture(m_vram_read_texture.get(), nullptr, 0, 0, m_vram_read_texture->GetWidth(),
                        m_vram_read_texture->GetHeight());
    }
    else
    {
      SetDisplayTexture(m_vram_texture.get(), nullptr, 0, 0, m_vram_texture->GetWidth(), m_vram_texture->GetHeight());
    }

    return;
  }

  // 24bpp display is always extracted at 1x, since texels don't map 1:1 to pixels.
  const bool interlaced = IsInterlacedDisplayEnabled();
  const u32 interlaced_field = GetInterlacedDisplayField();
  const u32 resolution_scale = m_GPUSTAT.display_area_color_depth_24 ? 1 : m_resolution_scale;
  const u32 scaled_vram_offset_x = m_crtc_state.display_vram_left * resolution_scale;
  const u32 scaled_vram_offset_y = (m_crtc_state.display_vram_top * resolution_scale) +
                                   ((interlaced && m_GPUSTAT.vertical_resolution) ? interlaced_field : 0);
  const u32 scaled_display_width = m_crtc_state.display_vram_width * resolution_scale;
  const u32 scaled_display_height = m_crtc_state.display_vram_height * resolution_scale;
  // Interlaced output only samples every other line of the frame.
  const u32 read_height = interlaced ? (scaled_display_height / 2u) : scaled_display_height;
  const u32 line_skip = BoolToUInt32(interlaced && m_GPUSTAT.vertical_resolution);
  bool drew_anything = false;

  // Don't bother grabbing depth if postfx doesn't need it.
  GPUTexture* depth_source = (!m_GPUSTAT.display_area_color_depth_24 && m_pgxp_depth_buffer &&
                              PostProcessing::InternalChain.NeedsDepthBuffer()) ?
                               (m_depth_was_copied ? m_vram_depth_copy_texture.get() : m_vram_depth_texture.get()) :
                               nullptr;

  if (IsDisplayDisabled())
  {
    ClearDisplayTexture();
    return;
  }
  else if (!m_GPUSTAT.display_area_color_depth_24 && !IsUsingMultisampling() &&
           (scaled_vram_offset_x + scaled_display_width) <= m_vram_texture->GetWidth() &&
           (scaled_vram_offset_y + scaled_display_height) <= m_vram_texture->GetHeight() &&
           !PostProcessing::InternalChain.IsActive())
  {
    // Fast path: display straight out of the VRAM texture, no intermediate copy.
    SetDisplayTexture(m_vram_texture.get(), depth_source, scaled_vram_offset_x, scaled_vram_offset_y,
                      scaled_display_width, read_height);

    // Fast path if no copies are needed.
    if (interlaced)
    {
      GL_INS("Deinterlace fast path");
      drew_anything = true;
      Deinterlace(interlaced_field, line_skip);
    }
    else
    {
      GL_INS("Direct display");
    }
  }
  else
  {
    // Slow path: extract the display area into a dedicated texture via a shader pass.
    if (!m_vram_extract_texture || m_vram_extract_texture->GetWidth() != scaled_display_width ||
        m_vram_extract_texture->GetHeight() != read_height)
    {
      if (!g_gpu_device->ResizeTexture(&m_vram_extract_texture, scaled_display_width, read_height,
                                       GPUTexture::Type::RenderTarget, GPUTexture::Format::RGBA8)) [[unlikely]]
      {
        ClearDisplayTexture();
        return;
      }
    }

    m_vram_texture->MakeReadyForSampling();
    g_gpu_device->InvalidateRenderTarget(m_vram_extract_texture.get());

    // Extract color+depth in one pass when depth is wanted and the depth RT is
    // available. NOTE(review): the depth texture is sized/checked against
    // scaled_display_height while the color extract uses read_height — presumably
    // intentional for the interlaced case, but worth confirming upstream.
    if (depth_source &&
        ((m_vram_extract_depth_texture && m_vram_extract_depth_texture->GetWidth() == scaled_display_width &&
          m_vram_extract_depth_texture->GetHeight() == scaled_display_height) ||
         !g_gpu_device->ResizeTexture(&m_vram_extract_depth_texture, scaled_display_width, scaled_display_height,
                                      GPUTexture::Type::RenderTarget, VRAM_DS_COLOR_FORMAT)))
    {
      depth_source->MakeReadyForSampling();
      g_gpu_device->InvalidateRenderTarget(m_vram_extract_depth_texture.get());

      GPUTexture* targets[] = {m_vram_extract_texture.get(), m_vram_extract_depth_texture.get()};
      g_gpu_device->SetRenderTargets(targets, static_cast<u32>(std::size(targets)), nullptr);
      g_gpu_device->SetPipeline(m_vram_extract_pipeline[2].get());

      g_gpu_device->SetTextureSampler(0, m_vram_texture.get(), g_gpu_device->GetNearestSampler());
      g_gpu_device->SetTextureSampler(1, depth_source, g_gpu_device->GetNearestSampler());
    }
    else
    {
      // Color-only extraction; pipeline [1] handles 24bpp unpacking, [0] plain 16bpp.
      g_gpu_device->SetRenderTarget(m_vram_extract_texture.get());
      g_gpu_device->SetPipeline(m_vram_extract_pipeline[BoolToUInt8(m_GPUSTAT.display_area_color_depth_24)].get());
      g_gpu_device->SetTextureSampler(0, m_vram_texture.get(), g_gpu_device->GetNearestSampler());
    }

    // 24bpp unpacking must start from the raw CRTC X register; skip_x trims the
    // difference to the visible left edge.
    const u32 reinterpret_start_x = m_crtc_state.regs.X * resolution_scale;
    const u32 skip_x = (m_crtc_state.display_vram_left - m_crtc_state.regs.X) * resolution_scale;
    GL_INS_FMT("VRAM extract, depth = {}, 24bpp = {}, skip_x = {}, line_skip = {}", depth_source ? "yes" : "no",
               m_GPUSTAT.display_area_color_depth_24.GetValue(), skip_x, line_skip);
    GL_INS_FMT("Source: {},{} => {},{} ({}x{})", reinterpret_start_x, scaled_vram_offset_y,
               reinterpret_start_x + scaled_display_width, scaled_vram_offset_y + read_height, scaled_display_width,
               read_height);

    const u32 uniforms[4] = {reinterpret_start_x, scaled_vram_offset_y, skip_x, line_skip};
    g_gpu_device->PushUniformBuffer(uniforms, sizeof(uniforms));

    g_gpu_device->SetViewportAndScissor(0, 0, scaled_display_width, read_height);
    g_gpu_device->Draw(3, 0);

    m_vram_extract_texture->MakeReadyForSampling();
    if (depth_source)
    {
      // Thanks DX11...
      m_vram_extract_depth_texture->MakeReadyForSampling();
      g_gpu_device->SetTextureSampler(1, nullptr, nullptr);
    }

    drew_anything = true;

    SetDisplayTexture(m_vram_extract_texture.get(), depth_source ? m_vram_extract_depth_texture.get() : nullptr, 0, 0,
                      scaled_display_width, read_height);
    if (g_settings.display_24bit_chroma_smoothing)
    {
      if (ApplyChromaSmoothing())
      {
        if (interlaced)
          Deinterlace(interlaced_field, 0);
      }
    }
    else
    {
      if (interlaced)
        Deinterlace(interlaced_field, 0);
    }
  }

  if (m_downsample_mode != GPUDownsampleMode::Disabled && !m_GPUSTAT.display_area_color_depth_24)
  {
    DebugAssert(m_display_texture);
    DownsampleFramebuffer();
  }

  if (drew_anything)
    RestoreDeviceContext();
}
   3755 
   3756 void GPU_HW::UpdateDownsamplingLevels()
   3757 {
   3758   if (m_downsample_mode == GPUDownsampleMode::Adaptive)
   3759   {
   3760     m_downsample_scale_or_levels = 0;
   3761     u32 current_width = VRAM_WIDTH * m_resolution_scale;
   3762     while (current_width >= VRAM_WIDTH)
   3763     {
   3764       m_downsample_scale_or_levels++;
   3765       current_width /= 2;
   3766     }
   3767   }
   3768   else if (m_downsample_mode == GPUDownsampleMode::Box)
   3769   {
   3770     m_downsample_scale_or_levels = m_resolution_scale / GetBoxDownsampleScale(m_resolution_scale);
   3771   }
   3772   else
   3773   {
   3774     m_downsample_scale_or_levels = 0;
   3775   }
   3776 
   3777   // Toss downsampling buffer, it's likely going to change resolution.
   3778   g_gpu_device->RecycleTexture(std::move(m_downsample_texture));
   3779 }
   3780 
// Called after the frame is presented. Clears m_depth_was_copied so the next
// UpdateDisplay() samples the live depth buffer rather than last frame's copy.
void GPU_HW::OnBufferSwapped()
{
  GL_INS("OnBufferSwapped()");
  m_depth_was_copied = false;
}
   3786 
   3787 void GPU_HW::DownsampleFramebuffer()
   3788 {
   3789   GPUTexture* source = m_display_texture;
   3790   const u32 left = m_display_texture_view_x;
   3791   const u32 top = m_display_texture_view_y;
   3792   const u32 width = m_display_texture_view_width;
   3793   const u32 height = m_display_texture_view_height;
   3794 
   3795   if (m_downsample_mode == GPUDownsampleMode::Adaptive)
   3796     DownsampleFramebufferAdaptive(source, left, top, width, height);
   3797   else
   3798     DownsampleFramebufferBoxFilter(source, left, top, width, height);
   3799 }
   3800 
   3801 void GPU_HW::DownsampleFramebufferAdaptive(GPUTexture* source, u32 left, u32 top, u32 width, u32 height)
   3802 {
   3803   GL_PUSH_FMT("DownsampleFramebufferAdaptive ({},{} => {},{})", left, top, left + width, left + height);
   3804 
   3805   struct SmoothingUBOData
   3806   {
   3807     float min_uv[2];
   3808     float max_uv[2];
   3809     float rcp_size[2];
   3810     float lod;
   3811   };
   3812 
   3813   if (!m_downsample_texture || m_downsample_texture->GetWidth() != width || m_downsample_texture->GetHeight() != height)
   3814   {
   3815     g_gpu_device->RecycleTexture(std::move(m_downsample_texture));
   3816     m_downsample_texture =
   3817       g_gpu_device->FetchTexture(width, height, 1, 1, 1, GPUTexture::Type::RenderTarget, VRAM_RT_FORMAT);
   3818   }
   3819   std::unique_ptr<GPUTexture, GPUDevice::PooledTextureDeleter> level_texture = g_gpu_device->FetchAutoRecycleTexture(
   3820     width, height, 1, m_downsample_scale_or_levels, 1, GPUTexture::Type::Texture, VRAM_RT_FORMAT);
   3821   std::unique_ptr<GPUTexture, GPUDevice::PooledTextureDeleter> weight_texture =
   3822     g_gpu_device->FetchAutoRecycleTexture(std::max(width >> (m_downsample_scale_or_levels - 1), 1u),
   3823                                           std::max(height >> (m_downsample_scale_or_levels - 1), 1u), 1, 1, 1,
   3824                                           GPUTexture::Type::RenderTarget, GPUTexture::Format::R8);
   3825   if (!m_downsample_texture || !level_texture || !weight_texture)
   3826   {
   3827     ERROR_LOG("Failed to create {}x{} RTs for adaptive downsampling", width, height);
   3828     return;
   3829   }
   3830 
   3831   g_gpu_device->CopyTextureRegion(level_texture.get(), 0, 0, 0, 0, source, left, top, 0, 0, width, height);
   3832   g_gpu_device->SetTextureSampler(0, level_texture.get(), m_downsample_lod_sampler.get());
   3833 
   3834   SmoothingUBOData uniforms;
   3835 
   3836   // create mip chain
   3837   for (u32 level = 1; level < m_downsample_scale_or_levels; level++)
   3838   {
   3839     GL_SCOPE_FMT("Create miplevel {}", level);
   3840 
   3841     const u32 level_width = width >> level;
   3842     const u32 level_height = height >> level;
   3843     const float rcp_width = 1.0f / static_cast<float>(level_texture->GetMipWidth(level));
   3844     const float rcp_height = 1.0f / static_cast<float>(level_texture->GetMipHeight(level));
   3845     uniforms.min_uv[0] = 0.0f;
   3846     uniforms.min_uv[1] = 0.0f;
   3847     uniforms.max_uv[0] = static_cast<float>(level_width) * rcp_width;
   3848     uniforms.max_uv[1] = static_cast<float>(level_height) * rcp_height;
   3849     uniforms.rcp_size[0] = rcp_width;
   3850     uniforms.rcp_size[1] = rcp_height;
   3851     uniforms.lod = static_cast<float>(level - 1);
   3852 
   3853     g_gpu_device->InvalidateRenderTarget(m_downsample_texture.get());
   3854     g_gpu_device->SetRenderTarget(m_downsample_texture.get());
   3855     g_gpu_device->SetViewportAndScissor(GSVector4i(0, 0, level_width, level_height));
   3856     g_gpu_device->SetPipeline((level == 1) ? m_downsample_first_pass_pipeline.get() :
   3857                                              m_downsample_mid_pass_pipeline.get());
   3858     g_gpu_device->PushUniformBuffer(&uniforms, sizeof(uniforms));
   3859     g_gpu_device->Draw(3, 0);
   3860     g_gpu_device->CopyTextureRegion(level_texture.get(), 0, 0, 0, level, m_downsample_texture.get(), 0, 0, 0, 0,
   3861                                     level_width, level_height);
   3862   }
   3863 
   3864   // blur pass at lowest level
   3865   {
   3866     GL_SCOPE("Blur");
   3867 
   3868     const u32 last_level = m_downsample_scale_or_levels - 1;
   3869     const u32 last_width = level_texture->GetMipWidth(last_level);
   3870     const u32 last_height = level_texture->GetMipHeight(last_level);
   3871     const float rcp_width = 1.0f / static_cast<float>(m_downsample_texture->GetWidth());
   3872     const float rcp_height = 1.0f / static_cast<float>(m_downsample_texture->GetHeight());
   3873     uniforms.min_uv[0] = 0.0f;
   3874     uniforms.min_uv[1] = 0.0f;
   3875     uniforms.max_uv[0] = static_cast<float>(last_width) * rcp_width;
   3876     uniforms.max_uv[1] = static_cast<float>(last_height) * rcp_height;
   3877     uniforms.rcp_size[0] = rcp_width;
   3878     uniforms.rcp_size[1] = rcp_height;
   3879     uniforms.lod = 0.0f;
   3880 
   3881     m_downsample_texture->MakeReadyForSampling();
   3882     g_gpu_device->InvalidateRenderTarget(weight_texture.get());
   3883     g_gpu_device->SetRenderTarget(weight_texture.get());
   3884     g_gpu_device->SetTextureSampler(0, m_downsample_texture.get(), g_gpu_device->GetNearestSampler());
   3885     g_gpu_device->SetViewportAndScissor(GSVector4i(0, 0, last_width, last_height));
   3886     g_gpu_device->SetPipeline(m_downsample_blur_pass_pipeline.get());
   3887     g_gpu_device->PushUniformBuffer(&uniforms, sizeof(uniforms));
   3888     g_gpu_device->Draw(3, 0);
   3889     weight_texture->MakeReadyForSampling();
   3890   }
   3891 
   3892   // composite downsampled and upsampled images together
   3893   {
   3894     GL_SCOPE("Composite");
   3895 
   3896     uniforms.min_uv[0] = 0.0f;
   3897     uniforms.min_uv[1] = 0.0f;
   3898     uniforms.max_uv[0] = 1.0f;
   3899     uniforms.max_uv[1] = 1.0f;
   3900 
   3901     g_gpu_device->InvalidateRenderTarget(m_downsample_texture.get());
   3902     g_gpu_device->SetRenderTarget(m_downsample_texture.get());
   3903     g_gpu_device->SetTextureSampler(0, level_texture.get(), m_downsample_composite_sampler.get());
   3904     g_gpu_device->SetTextureSampler(1, weight_texture.get(), m_downsample_lod_sampler.get());
   3905     g_gpu_device->SetViewportAndScissor(GSVector4i(0, 0, width, height));
   3906     g_gpu_device->SetPipeline(m_downsample_composite_pass_pipeline.get());
   3907     g_gpu_device->PushUniformBuffer(&uniforms, sizeof(uniforms));
   3908     g_gpu_device->Draw(3, 0);
   3909     m_downsample_texture->MakeReadyForSampling();
   3910   }
   3911 
   3912   GL_POP();
   3913 
   3914   RestoreDeviceContext();
   3915 
   3916   SetDisplayTexture(m_downsample_texture.get(), m_display_depth_buffer, 0, 0, width, height);
   3917 }
   3918 
   3919 void GPU_HW::DownsampleFramebufferBoxFilter(GPUTexture* source, u32 left, u32 top, u32 width, u32 height)
   3920 {
   3921   GL_SCOPE_FMT("DownsampleFramebufferBoxFilter({},{} => {},{} ({}x{})", left, top, left + width, top + height, width,
   3922                height);
   3923 
   3924   const u32 ds_width = width / m_downsample_scale_or_levels;
   3925   const u32 ds_height = height / m_downsample_scale_or_levels;
   3926 
   3927   if (!m_downsample_texture || m_downsample_texture->GetWidth() != ds_width ||
   3928       m_downsample_texture->GetHeight() != ds_height)
   3929   {
   3930     g_gpu_device->RecycleTexture(std::move(m_downsample_texture));
   3931     m_downsample_texture =
   3932       g_gpu_device->FetchTexture(ds_width, ds_height, 1, 1, 1, GPUTexture::Type::RenderTarget, VRAM_RT_FORMAT);
   3933   }
   3934   if (!m_downsample_texture)
   3935   {
   3936     ERROR_LOG("Failed to create {}x{} RT for box downsampling", width, height);
   3937     return;
   3938   }
   3939 
   3940   source->MakeReadyForSampling();
   3941 
   3942   const u32 uniforms[4] = {left, top, 0u, 0u};
   3943 
   3944   g_gpu_device->InvalidateRenderTarget(m_downsample_texture.get());
   3945   g_gpu_device->SetRenderTarget(m_downsample_texture.get());
   3946   g_gpu_device->SetPipeline(m_downsample_first_pass_pipeline.get());
   3947   g_gpu_device->SetTextureSampler(0, source, g_gpu_device->GetNearestSampler());
   3948   g_gpu_device->SetViewportAndScissor(0, 0, ds_width, ds_height);
   3949   g_gpu_device->PushUniformBuffer(uniforms, sizeof(uniforms));
   3950   g_gpu_device->Draw(3, 0);
   3951 
   3952   RestoreDeviceContext();
   3953 
   3954   SetDisplayTexture(m_downsample_texture.get(), m_display_depth_buffer, 0, 0, ds_width, ds_height);
   3955 }
   3956 
   3957 void GPU_HW::DrawRendererStats()
   3958 {
   3959   if (ImGui::CollapsingHeader("Renderer Statistics", ImGuiTreeNodeFlags_DefaultOpen))
   3960   {
   3961     static const ImVec4 active_color{1.0f, 1.0f, 1.0f, 1.0f};
   3962     static const ImVec4 inactive_color{0.4f, 0.4f, 0.4f, 1.0f};
   3963 
   3964     ImGui::Columns(2);
   3965     ImGui::SetColumnWidth(0, 200.0f * ImGuiManager::GetGlobalScale());
   3966 
   3967     ImGui::TextUnformatted("Resolution Scale:");
   3968     ImGui::NextColumn();
   3969     ImGui::Text("%u (VRAM %ux%u)", m_resolution_scale, VRAM_WIDTH * m_resolution_scale,
   3970                 VRAM_HEIGHT * m_resolution_scale);
   3971     ImGui::NextColumn();
   3972 
   3973     ImGui::TextUnformatted("Effective Display Resolution:");
   3974     ImGui::NextColumn();
   3975     ImGui::Text("%ux%u", m_crtc_state.display_vram_width * m_resolution_scale,
   3976                 m_crtc_state.display_vram_height * m_resolution_scale);
   3977     ImGui::NextColumn();
   3978 
   3979     ImGui::TextUnformatted("True Color:");
   3980     ImGui::NextColumn();
   3981     ImGui::TextColored(m_true_color ? active_color : inactive_color, m_true_color ? "Enabled" : "Disabled");
   3982     ImGui::NextColumn();
   3983 
   3984     const bool debanding = (g_settings.gpu_true_color && g_settings.gpu_debanding);
   3985     ImGui::TextUnformatted("Debanding:");
   3986     ImGui::NextColumn();
   3987     ImGui::TextColored(debanding ? active_color : inactive_color, debanding ? "Enabled" : "Disabled");
   3988     ImGui::NextColumn();
   3989 
   3990     const bool scaled_dithering = (m_resolution_scale > 1 && g_settings.gpu_scaled_dithering);
   3991     ImGui::TextUnformatted("Scaled Dithering:");
   3992     ImGui::NextColumn();
   3993     ImGui::TextColored(scaled_dithering ? active_color : inactive_color, scaled_dithering ? "Enabled" : "Disabled");
   3994     ImGui::NextColumn();
   3995 
   3996     ImGui::TextUnformatted("Texture Filtering:");
   3997     ImGui::NextColumn();
   3998     ImGui::TextColored((m_texture_filtering != GPUTextureFilter::Nearest) ? active_color : inactive_color, "%s",
   3999                        Settings::GetTextureFilterDisplayName(m_texture_filtering));
   4000     ImGui::NextColumn();
   4001 
   4002     ImGui::TextUnformatted("PGXP:");
   4003     ImGui::NextColumn();
   4004     ImGui::TextColored(g_settings.gpu_pgxp_enable ? active_color : inactive_color, "Geom");
   4005     ImGui::SameLine();
   4006     ImGui::TextColored((g_settings.gpu_pgxp_enable && g_settings.gpu_pgxp_culling) ? active_color : inactive_color,
   4007                        "Cull");
   4008     ImGui::SameLine();
   4009     ImGui::TextColored(
   4010       (g_settings.gpu_pgxp_enable && g_settings.gpu_pgxp_texture_correction) ? active_color : inactive_color, "Tex");
   4011     ImGui::SameLine();
   4012     ImGui::TextColored((g_settings.gpu_pgxp_enable && g_settings.gpu_pgxp_vertex_cache) ? active_color : inactive_color,
   4013                        "Cache");
   4014     ImGui::NextColumn();
   4015 
   4016     ImGui::Columns(1);
   4017   }
   4018 }
   4019 
   4020 std::unique_ptr<GPU> GPU::CreateHardwareRenderer()
   4021 {
   4022   std::unique_ptr<GPU_HW> gpu(std::make_unique<GPU_HW>());
   4023   if (!gpu->Initialize())
   4024     return nullptr;
   4025 
   4026   return gpu;
   4027 }