duckstation

duckstation, archived from the revision just before upstream changed it to a proprietary software project; this version is the libre one
git clone https://git.neptards.moe/u3shit/duckstation.git
Log | Files | Refs | README | LICENSE

gpu_hw_shadergen.cpp (62170B)


      1 // SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
      2 // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
      3 
      4 #include "gpu_hw_shadergen.h"
      5 
      6 #include "common/assert.h"
      7 
      8 GPU_HW_ShaderGen::GPU_HW_ShaderGen(RenderAPI render_api, u32 resolution_scale, u32 multisamples,
      9                                    bool per_sample_shading, bool true_color, bool scaled_dithering,
     10                                    bool write_mask_as_depth, bool disable_color_perspective,
     11                                    bool supports_dual_source_blend, bool supports_framebuffer_fetch, bool debanding)
     12   : ShaderGen(render_api, GetShaderLanguageForAPI(render_api), supports_dual_source_blend, supports_framebuffer_fetch),
     13     m_resolution_scale(resolution_scale), m_multisamples(multisamples), m_per_sample_shading(per_sample_shading),
     14     m_true_color(true_color), m_scaled_dithering(scaled_dithering), m_write_mask_as_depth(write_mask_as_depth),
     15     m_disable_color_perspective(disable_color_perspective), m_debanding(debanding)
     16 {
     17 }
     18 
     19 GPU_HW_ShaderGen::~GPU_HW_ShaderGen() = default;
     20 
     21 void GPU_HW_ShaderGen::WriteCommonFunctions(std::stringstream& ss)
     22 {
     23   DefineMacro(ss, "MULTISAMPLING", UsingMSAA());
     24 
     25   ss << "CONSTANT uint RESOLUTION_SCALE = " << m_resolution_scale << "u;\n";
     26   ss << "CONSTANT uint2 VRAM_SIZE = uint2(" << VRAM_WIDTH << ", " << VRAM_HEIGHT << ") * RESOLUTION_SCALE;\n";
     27   ss << "CONSTANT uint MULTISAMPLES = " << m_multisamples << "u;\n";
     28   ss << "CONSTANT bool PER_SAMPLE_SHADING = " << (m_per_sample_shading ? "true" : "false") << ";\n";
     29   ss << R"(
     30 uint RGBA8ToRGBA5551(float4 v)
     31 {
     32   uint r = uint(roundEven(v.r * 31.0));
     33   uint g = uint(roundEven(v.g * 31.0));
     34   uint b = uint(roundEven(v.b * 31.0));
     35   uint a = (v.a != 0.0) ? 1u : 0u;
     36   return (r) | (g << 5) | (b << 10) | (a << 15);
     37 }
     38 
     39 float4 RGBA5551ToRGBA8(uint v)
     40 {
     41   uint r = (v & 31u);
     42   uint g = ((v >> 5) & 31u);
     43   uint b = ((v >> 10) & 31u);
     44   uint a = ((v >> 15) & 1u);
     45 
     46   return float4(float(r) / 31.0, float(g) / 31.0, float(b) / 31.0, float(a));
     47 }
     48 )";
     49 }
     50 
     51 void GPU_HW_ShaderGen::WriteBatchUniformBuffer(std::stringstream& ss)
     52 {
     53   DeclareUniformBuffer(ss,
     54                        {"uint2 u_texture_window_and", "uint2 u_texture_window_or", "float u_src_alpha_factor",
     55                         "float u_dst_alpha_factor", "uint u_interlaced_displayed_field",
     56                         "bool u_set_mask_while_drawing"},
     57                        false);
     58 }
     59 
     60 std::string GPU_HW_ShaderGen::GenerateBatchVertexShader(bool textured, bool palette, bool uv_limits,
     61                                                         bool force_round_texcoords, bool pgxp_depth)
     62 {
     63   std::stringstream ss;
     64   WriteHeader(ss);
     65   DefineMacro(ss, "TEXTURED", textured);
     66   DefineMacro(ss, "PALETTE", palette);
     67   DefineMacro(ss, "UV_LIMITS", uv_limits);
     68   DefineMacro(ss, "FORCE_ROUND_TEXCOORDS", force_round_texcoords);
     69   DefineMacro(ss, "PGXP_DEPTH", pgxp_depth);
     70 
     71   WriteCommonFunctions(ss);
     72   WriteBatchUniformBuffer(ss);
     73 
     74   if (textured)
     75   {
     76     if (uv_limits)
     77     {
     78       DeclareVertexEntryPoint(
     79         ss, {"float4 a_pos", "float4 a_col0", "uint a_texcoord", "uint a_texpage", "float4 a_uv_limits"}, 1, 1,
     80         {{"nointerpolation", palette ? "uint4 v_texpage" : "uint2 v_texpage"},
     81          {"nointerpolation", "float4 v_uv_limits"}},
     82         false, "", UsingMSAA(), UsingPerSampleShading(), m_disable_color_perspective);
     83     }
     84     else
     85     {
     86       DeclareVertexEntryPoint(ss, {"float4 a_pos", "float4 a_col0", "uint a_texcoord", "uint a_texpage"}, 1, 1,
     87                               {{"nointerpolation", palette ? "uint4 v_texpage" : "uint2 v_texpage"}}, false, "",
     88                               UsingMSAA(), UsingPerSampleShading(), m_disable_color_perspective);
     89     }
     90   }
     91   else
     92   {
     93     DeclareVertexEntryPoint(ss, {"float4 a_pos", "float4 a_col0"}, 1, 0, {}, false, "", UsingMSAA(),
     94                             UsingPerSampleShading(), m_disable_color_perspective);
     95   }
     96 
     97   ss << R"(
     98 {
     99   // Offset the vertex position by 0.5 to ensure correct interpolation of texture coordinates
    100   // at 1x resolution scale. This doesn't work at >1x, we adjust the texture coordinates before
    101   // uploading there instead.
    102   float vertex_offset = (RESOLUTION_SCALE == 1u) ? 0.5 : 0.0;
    103 
    104   // 0..+1023 -> -1..1
    105   float pos_x = ((a_pos.x + vertex_offset) / 512.0) - 1.0;
    106   float pos_y = ((a_pos.y + vertex_offset) / -256.0) + 1.0;
    107 
    108 #if PGXP_DEPTH
    109   // Ignore mask Z when using PGXP depth.
    110   float pos_z = a_pos.w;
    111   float pos_w = a_pos.w;
    112 #else
    113   float pos_z = a_pos.z;
    114   float pos_w = a_pos.w;
    115 #endif
    116 
    117 #if API_OPENGL || API_OPENGL_ES
    118   // 0..1 to -1..1 depth range.
    119   pos_z = (pos_z * 2.0) - 1.0;
    120 #endif
    121 
    122   // NDC space Y flip in Vulkan.
    123 #if API_OPENGL || API_OPENGL_ES || API_VULKAN
    124   pos_y = -pos_y;
    125 #endif
    126 
    127   v_pos = float4(pos_x * pos_w, pos_y * pos_w, pos_z * pos_w, pos_w);
    128 
    129   v_col0 = a_col0;
    130   #if TEXTURED
    131     v_tex0 = float2(uint2(a_texcoord & 0xFFFFu, a_texcoord >> 16));
    132     #if !PALETTE
    133       v_tex0 *= float(RESOLUTION_SCALE);
    134     #endif
    135 
    136     // base_x,base_y,palette_x,palette_y
    137     v_texpage.x = (a_texpage & 15u) * 64u;
    138     v_texpage.y = ((a_texpage >> 4) & 1u) * 256u;
    139     #if PALETTE
    140       v_texpage.z = ((a_texpage >> 16) & 63u) * 16u;
    141       v_texpage.w = ((a_texpage >> 22) & 511u);
    142     #endif
    143 
    144     #if UV_LIMITS
    145       v_uv_limits = a_uv_limits * 255.0;
    146 
    147       #if FORCE_ROUND_TEXCOORDS && PALETTE
    148         // Add 0.5 to the upper bounds when upscaling, to work around interpolation differences.
    149         // Limited to force-round-texcoord hack, to avoid breaking other games.
    150         v_uv_limits.zw += 0.5;
    151       #elif !PALETTE
    152         // Treat coordinates as being in upscaled space, and extend the UV range to all "upscaled"
    153         // pixels. This means 1-pixel-high polygon-based framebuffer effects won't be downsampled.
    154         // (e.g. Mega Man Legends 2 haze effect)
    155         v_uv_limits *= float(RESOLUTION_SCALE);
    156         v_uv_limits.zw += float(RESOLUTION_SCALE - 1u);
    157       #endif
    158     #endif
    159   #endif
    160 }
    161 )";
    162 
    163   return ss.str();
    164 }
    165 
    166 void GPU_HW_ShaderGen::WriteBatchTextureFilter(std::stringstream& ss, GPUTextureFilter texture_filter)
    167 {
    168   // JINC2 and xBRZ shaders originally from beetle-psx, modified to support filtering mask channel.
    169   if (texture_filter == GPUTextureFilter::Bilinear || texture_filter == GPUTextureFilter::BilinearBinAlpha)
    170   {
    171     DefineMacro(ss, "BINALPHA", texture_filter == GPUTextureFilter::BilinearBinAlpha);
    172     ss << R"(
    173 void FilteredSampleFromVRAM(TEXPAGE_VALUE texpage, float2 coords, float4 uv_limits,
    174                             out float4 texcol, out float ialpha)
    175 {
    176   // Compute the coordinates of the four texels we will be interpolating between.
    177   // Clamp this to the triangle texture coordinates.
    178   float2 texel_top_left = frac(coords) - float2(0.5, 0.5);
    179   float2 texel_offset = sign(texel_top_left);
    180   float4 fcoords = max(coords.xyxy + float4(0.0, 0.0, texel_offset.x, texel_offset.y),
    181                         float4(0.0, 0.0, 0.0, 0.0));
    182 
    183   // Load four texels.
    184   float4 s00 = SampleFromVRAM(texpage, clamp(fcoords.xy, uv_limits.xy, uv_limits.zw));
    185   float4 s10 = SampleFromVRAM(texpage, clamp(fcoords.zy, uv_limits.xy, uv_limits.zw));
    186   float4 s01 = SampleFromVRAM(texpage, clamp(fcoords.xw, uv_limits.xy, uv_limits.zw));
    187   float4 s11 = SampleFromVRAM(texpage, clamp(fcoords.zw, uv_limits.xy, uv_limits.zw));
    188 
    189   // Compute alpha from how many texels aren't pixel color 0000h.
    190   float a00 = float(VECTOR_NEQ(s00, TRANSPARENT_PIXEL_COLOR));
    191   float a10 = float(VECTOR_NEQ(s10, TRANSPARENT_PIXEL_COLOR));
    192   float a01 = float(VECTOR_NEQ(s01, TRANSPARENT_PIXEL_COLOR));
    193   float a11 = float(VECTOR_NEQ(s11, TRANSPARENT_PIXEL_COLOR));
    194 
    195   // Bilinearly interpolate.
    196   float2 weights = abs(texel_top_left);
    197   texcol = lerp(lerp(s00, s10, weights.x), lerp(s01, s11, weights.x), weights.y);
    198   ialpha = lerp(lerp(a00, a10, weights.x), lerp(a01, a11, weights.x), weights.y);
    199 
    200   // Compensate for partially transparent sampling.
    201   if (ialpha > 0.0)
    202     texcol.rgb /= float3(ialpha, ialpha, ialpha);
    203 
    204 #if BINALPHA
    205   ialpha = (ialpha >= 0.5) ? 1.0 : 0.0;
    206 #endif
    207 }
    208 )";
    209   }
    210   else if (texture_filter == GPUTextureFilter::JINC2 || texture_filter == GPUTextureFilter::JINC2BinAlpha)
    211   {
    212     /*
    213        Hyllian's jinc windowed-jinc 2-lobe sharper with anti-ringing Shader
    214 
    215        Copyright (C) 2011-2016 Hyllian/Jararaca - sergiogdb@gmail.com
    216 
    217        Permission is hereby granted, free of charge, to any person obtaining a copy
    218        of this software and associated documentation files (the "Software"), to deal
    219        in the Software without restriction, including without limitation the rights
    220        to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    221        copies of the Software, and to permit persons to whom the Software is
    222        furnished to do so, subject to the following conditions:
    223 
    224        The above copyright notice and this permission notice shall be included in
    225        all copies or substantial portions of the Software.
    226 
    227        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    228        IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    229        FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    230        AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    231        LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    232        OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    233        THE SOFTWARE.
    234     */
    235     DefineMacro(ss, "BINALPHA", texture_filter == GPUTextureFilter::JINC2BinAlpha);
    236     ss << R"(
    237 CONSTANT float JINC2_WINDOW_SINC = 0.44;
    238 CONSTANT float JINC2_SINC = 0.82;
    239 CONSTANT float JINC2_AR_STRENGTH = 0.8;
    240 
    241 CONSTANT   float halfpi            = 1.5707963267948966192313216916398;
    242 CONSTANT   float pi                = 3.1415926535897932384626433832795;
    243 CONSTANT   float wa                = 1.382300768;
    244 CONSTANT   float wb                = 2.576105976;
    245 
    246 // Calculates the distance between two points
    247 float d(float2 pt1, float2 pt2)
    248 {
    249   float2 v = pt2 - pt1;
    250   return sqrt(dot(v,v));
    251 }
    252 
    253 float min4(float a, float b, float c, float d)
    254 {
    255     return min(a, min(b, min(c, d)));
    256 }
    257 
    258 float4 min4(float4 a, float4 b, float4 c, float4 d)
    259 {
    260     return min(a, min(b, min(c, d)));
    261 }
    262 
    263 float max4(float a, float b, float c, float d)
    264 {
    265   return max(a, max(b, max(c, d)));
    266 }
    267 
    268 float4 max4(float4 a, float4 b, float4 c, float4 d)
    269 {
    270     return max(a, max(b, max(c, d)));
    271 }
    272 
    273 float4 resampler(float4 x)
    274 {
    275    float4 res;
    276 
    277    // res = (x==float4(0.0, 0.0, 0.0, 0.0)) ?  float4(wa*wb)  :  sin(x*wa)*sin(x*wb)/(x*x);
    278    // Need to use mix(.., equal(..)) since we want zero check to be component wise
    279    res = lerp(sin(x*wa)*sin(x*wb)/(x*x), float4(wa*wb, wa*wb, wa*wb, wa*wb), VECTOR_COMP_EQ(x,float4(0.0, 0.0, 0.0, 0.0)));
    280 
    281    return res;
    282 }
    283 
    284 void FilteredSampleFromVRAM(TEXPAGE_VALUE texpage, float2 coords, float4 uv_limits,
    285                             out float4 texcol, out float ialpha)
    286 {
    287     float4 weights[4];
    288 
    289     float2 dx = float2(1.0, 0.0);
    290     float2 dy = float2(0.0, 1.0);
    291 
    292     float2 pc = coords.xy;
    293 
    294     float2 tc = (floor(pc-float2(0.5,0.5))+float2(0.5,0.5));
    295 
    296     weights[0] = resampler(float4(d(pc, tc    -dx    -dy), d(pc, tc           -dy), d(pc, tc    +dx    -dy), d(pc, tc+2.0*dx    -dy)));
    297     weights[1] = resampler(float4(d(pc, tc    -dx       ), d(pc, tc              ), d(pc, tc    +dx       ), d(pc, tc+2.0*dx       )));
    298     weights[2] = resampler(float4(d(pc, tc    -dx    +dy), d(pc, tc           +dy), d(pc, tc    +dx    +dy), d(pc, tc+2.0*dx    +dy)));
    299     weights[3] = resampler(float4(d(pc, tc    -dx+2.0*dy), d(pc, tc       +2.0*dy), d(pc, tc    +dx+2.0*dy), d(pc, tc+2.0*dx+2.0*dy)));
    300 
    301     dx = dx;
    302     dy = dy;
    303     tc = tc;
    304 
    305 #define sample_texel(coords) SampleFromVRAM(texpage, clamp((coords), uv_limits.xy, uv_limits.zw))
    306 
    307     float4 c00 = sample_texel(tc    -dx    -dy);
    308     float a00 = float(VECTOR_NEQ(c00, TRANSPARENT_PIXEL_COLOR));
    309     float4 c10 = sample_texel(tc           -dy);
    310     float a10 = float(VECTOR_NEQ(c10, TRANSPARENT_PIXEL_COLOR));
    311     float4 c20 = sample_texel(tc    +dx    -dy);
    312     float a20 = float(VECTOR_NEQ(c20, TRANSPARENT_PIXEL_COLOR));
    313     float4 c30 = sample_texel(tc+2.0*dx    -dy);
    314     float a30 = float(VECTOR_NEQ(c30, TRANSPARENT_PIXEL_COLOR));
    315     float4 c01 = sample_texel(tc    -dx       );
    316     float a01 = float(VECTOR_NEQ(c01, TRANSPARENT_PIXEL_COLOR));
    317     float4 c11 = sample_texel(tc              );
    318     float a11 = float(VECTOR_NEQ(c11, TRANSPARENT_PIXEL_COLOR));
    319     float4 c21 = sample_texel(tc    +dx       );
    320     float a21 = float(VECTOR_NEQ(c21, TRANSPARENT_PIXEL_COLOR));
    321     float4 c31 = sample_texel(tc+2.0*dx       );
    322     float a31 = float(VECTOR_NEQ(c31, TRANSPARENT_PIXEL_COLOR));
    323     float4 c02 = sample_texel(tc    -dx    +dy);
    324     float a02 = float(VECTOR_NEQ(c02, TRANSPARENT_PIXEL_COLOR));
    325     float4 c12 = sample_texel(tc           +dy);
    326     float a12 = float(VECTOR_NEQ(c12, TRANSPARENT_PIXEL_COLOR));
    327     float4 c22 = sample_texel(tc    +dx    +dy);
    328     float a22 = float(VECTOR_NEQ(c22, TRANSPARENT_PIXEL_COLOR));
    329     float4 c32 = sample_texel(tc+2.0*dx    +dy);
    330     float a32 = float(VECTOR_NEQ(c32, TRANSPARENT_PIXEL_COLOR));
    331     float4 c03 = sample_texel(tc    -dx+2.0*dy);
    332     float a03 = float(VECTOR_NEQ(c03, TRANSPARENT_PIXEL_COLOR));
    333     float4 c13 = sample_texel(tc       +2.0*dy);
    334     float a13 = float(VECTOR_NEQ(c13, TRANSPARENT_PIXEL_COLOR));
    335     float4 c23 = sample_texel(tc    +dx+2.0*dy);
    336     float a23 = float(VECTOR_NEQ(c23, TRANSPARENT_PIXEL_COLOR));
    337     float4 c33 = sample_texel(tc+2.0*dx+2.0*dy);
    338     float a33 = float(VECTOR_NEQ(c33, TRANSPARENT_PIXEL_COLOR));
    339 
    340 #undef sample_texel
    341 
    342     //  Get min/max samples
    343     float4 min_sample = min4(c11, c21, c12, c22);
    344     float min_sample_alpha = min4(a11, a21, a12, a22);
    345     float4 max_sample = max4(c11, c21, c12, c22);
    346     float max_sample_alpha = max4(a11, a21, a12, a22);
    347 
    348     float4 color;
    349     color = float4(dot(weights[0], float4(c00.x, c10.x, c20.x, c30.x)), dot(weights[0], float4(c00.y, c10.y, c20.y, c30.y)), dot(weights[0], float4(c00.z, c10.z, c20.z, c30.z)), dot(weights[0], float4(c00.w, c10.w, c20.w, c30.w)));
    350     color+= float4(dot(weights[1], float4(c01.x, c11.x, c21.x, c31.x)), dot(weights[1], float4(c01.y, c11.y, c21.y, c31.y)), dot(weights[1], float4(c01.z, c11.z, c21.z, c31.z)), dot(weights[1], float4(c01.w, c11.w, c21.w, c31.w)));
    351     color+= float4(dot(weights[2], float4(c02.x, c12.x, c22.x, c32.x)), dot(weights[2], float4(c02.y, c12.y, c22.y, c32.y)), dot(weights[2], float4(c02.z, c12.z, c22.z, c32.z)), dot(weights[2], float4(c02.w, c12.w, c22.w, c32.w)));
    352     color+= float4(dot(weights[3], float4(c03.x, c13.x, c23.x, c33.x)), dot(weights[3], float4(c03.y, c13.y, c23.y, c33.y)), dot(weights[3], float4(c03.z, c13.z, c23.z, c33.z)), dot(weights[3], float4(c03.w, c13.w, c23.w, c33.w)));
    353     color = color/(dot(weights[0], float4(1,1,1,1)) + dot(weights[1], float4(1,1,1,1)) + dot(weights[2], float4(1,1,1,1)) + dot(weights[3], float4(1,1,1,1)));
    354 
    355     float alpha;
    356     alpha = dot(weights[0], float4(a00, a10, a20, a30));
    357     alpha+= dot(weights[1], float4(a01, a11, a21, a31));
    358     alpha+= dot(weights[2], float4(a02, a12, a22, a32));
    359     alpha+= dot(weights[3], float4(a03, a13, a23, a33));
    360     //alpha = alpha/(weights[0].w + weights[1].w + weights[2].w + weights[3].w);
    361     alpha = alpha/(dot(weights[0], float4(1,1,1,1)) + dot(weights[1], float4(1,1,1,1)) + dot(weights[2], float4(1,1,1,1)) + dot(weights[3], float4(1,1,1,1)));
    362 
    363     // Anti-ringing
    364     float4 aux = color;
    365     float aux_alpha = alpha;
    366     color = clamp(color, min_sample, max_sample);
    367     alpha = clamp(alpha, min_sample_alpha, max_sample_alpha);
    368     color = lerp(aux, color, JINC2_AR_STRENGTH);
    369     alpha = lerp(aux_alpha, alpha, JINC2_AR_STRENGTH);
    370 
    371     // final sum and weight normalization
    372     ialpha = alpha;
    373     texcol = color;
    374 
    375     // Compensate for partially transparent sampling.
    376     if (ialpha > 0.0)
    377       texcol.rgb /= float3(ialpha, ialpha, ialpha);
    378 
    379 #if BINALPHA
    380   ialpha = (ialpha >= 0.5) ? 1.0 : 0.0;
    381 #endif
    382 }
    383 )";
    384   }
    385   else if (texture_filter == GPUTextureFilter::xBR || texture_filter == GPUTextureFilter::xBRBinAlpha)
    386   {
    387     /*
    388        Hyllian's xBR-vertex code and texel mapping
    389 
    390        Copyright (C) 2011/2016 Hyllian - sergiogdb@gmail.com
    391 
    392        Permission is hereby granted, free of charge, to any person obtaining a copy
    393        of this software and associated documentation files (the "Software"), to deal
    394        in the Software without restriction, including without limitation the rights
    395        to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    396        copies of the Software, and to permit persons to whom the Software is
    397        furnished to do so, subject to the following conditions:
    398 
    399        The above copyright notice and this permission notice shall be included in
    400        all copies or substantial portions of the Software.
    401 
    402        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    403        IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    404        FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    405        AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    406        LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    407        OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    408        THE SOFTWARE.
    409     */
    410 
    411     DefineMacro(ss, "BINALPHA", texture_filter == GPUTextureFilter::xBRBinAlpha);
    412     ss << R"(
    413 CONSTANT int BLEND_NONE = 0;
    414 CONSTANT int BLEND_NORMAL = 1;
    415 CONSTANT int BLEND_DOMINANT = 2;
    416 CONSTANT float LUMINANCE_WEIGHT = 1.0;
    417 CONSTANT float EQUAL_COLOR_TOLERANCE = 0.1176470588235294;
    418 CONSTANT float STEEP_DIRECTION_THRESHOLD = 2.2;
    419 CONSTANT float DOMINANT_DIRECTION_THRESHOLD = 3.6;
    420 CONSTANT float4 w = float4(0.2627, 0.6780, 0.0593, 0.5);
    421 
    422 float DistYCbCr(float4 pixA, float4 pixB)
    423 {
    424   const float scaleB = 0.5 / (1.0 - w.b);
    425   const float scaleR = 0.5 / (1.0 - w.r);
    426   float4 diff = pixA - pixB;
    427   float Y = dot(diff, w);
    428   float Cb = scaleB * (diff.b - Y);
    429   float Cr = scaleR * (diff.r - Y);
    430 
    431   return sqrt(((LUMINANCE_WEIGHT * Y) * (LUMINANCE_WEIGHT * Y)) + (Cb * Cb) + (Cr * Cr));
    432 }
    433 
    434 bool IsPixEqual(const float4 pixA, const float4 pixB)
    435 {
    436   return (DistYCbCr(pixA, pixB) < EQUAL_COLOR_TOLERANCE);
    437 }
    438 
    439 float get_left_ratio(float2 center, float2 origin, float2 direction, float2 scale)
    440 {
    441   float2 P0 = center - origin;
    442   float2 proj = direction * (dot(P0, direction) / dot(direction, direction));
    443   float2 distv = P0 - proj;
    444   float2 orth = float2(-direction.y, direction.x);
    445   float side = sign(dot(P0, orth));
    446   float v = side * length(distv * scale);
    447 
    448 //  return step(0, v);
    449   return smoothstep(-sqrt(2.0)/2.0, sqrt(2.0)/2.0, v);
    450 }
    451 
    452 #define P(coord, xoffs, yoffs) SampleFromVRAM(texpage, clamp(coords + float2((xoffs), (yoffs)), uv_limits.xy, uv_limits.zw))
    453 
    454 void FilteredSampleFromVRAM(TEXPAGE_VALUE texpage, float2 coords, float4 uv_limits,
    455                             out float4 texcol, out float ialpha)
    456 {
    457   //---------------------------------------
    458   // Input Pixel Mapping:  -|x|x|x|-
    459   //                       x|A|B|C|x
    460   //                       x|D|E|F|x
    461   //                       x|G|H|I|x
    462   //                       -|x|x|x|-
    463 
    464   float2 scale = float2(8.0, 8.0);
    465   float2 pos = frac(coords.xy) - float2(0.5, 0.5);
    466   float2 coord = coords.xy - pos;
    467 
    468   float4 A = P(coord, -1,-1);
    469   float Aw = A.w;
    470   A.w = float(VECTOR_NEQ(A, TRANSPARENT_PIXEL_COLOR));
    471   float4 B = P(coord,  0,-1);
    472   float Bw = B.w;
    473   B.w = float(VECTOR_NEQ(B, TRANSPARENT_PIXEL_COLOR));
    474   float4 C = P(coord,  1,-1);
    475   float Cw = C.w;
    476   C.w = float(VECTOR_NEQ(C, TRANSPARENT_PIXEL_COLOR));
    477   float4 D = P(coord, -1, 0);
    478   float Dw = D.w;
    479   D.w = float(VECTOR_NEQ(D, TRANSPARENT_PIXEL_COLOR));
    480   float4 E = P(coord, 0, 0);
    481   float Ew = E.w;
    482   E.w = float(VECTOR_NEQ(E, TRANSPARENT_PIXEL_COLOR));
    483   float4 F = P(coord,  1, 0);
    484   float Fw = F.w;
    485   F.w = float(VECTOR_NEQ(F, TRANSPARENT_PIXEL_COLOR));
    486   float4 G = P(coord, -1, 1);
    487   float Gw = G.w;
    488   G.w = float(VECTOR_NEQ(G, TRANSPARENT_PIXEL_COLOR));
    489   float4 H = P(coord,  0, 1);
    490   float Hw = H.w;
    491   H.w = float(VECTOR_NEQ(H, TRANSPARENT_PIXEL_COLOR));
    492   float4 I = P(coord,  1, 1);
    493   float Iw = I.w;
    494   I.w = float(VECTOR_NEQ(H, TRANSPARENT_PIXEL_COLOR));
    495 
    496   // blendResult Mapping: x|y|
    497   //                      w|z|
    498   int4 blendResult = int4(BLEND_NONE,BLEND_NONE,BLEND_NONE,BLEND_NONE);
    499 
    500   // Preprocess corners
    501   // Pixel Tap Mapping: -|-|-|-|-
    502   //                    -|-|B|C|-
    503   //                    -|D|E|F|x
    504   //                    -|G|H|I|x
    505   //                    -|-|x|x|-
    506   if (!((VECTOR_EQ(E,F) && VECTOR_EQ(H,I)) || (VECTOR_EQ(E,H) && VECTOR_EQ(F,I))))
    507   {
    508     float dist_H_F = DistYCbCr(G, E) + DistYCbCr(E, C) + DistYCbCr(P(coord, 0,2), I) + DistYCbCr(I, P(coord, 2,0)) + (4.0 * DistYCbCr(H, F));
    509     float dist_E_I = DistYCbCr(D, H) + DistYCbCr(H, P(coord, 1,2)) + DistYCbCr(B, F) + DistYCbCr(F, P(coord, 2,1)) + (4.0 * DistYCbCr(E, I));
    510     bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_H_F) < dist_E_I;
    511     blendResult.z = ((dist_H_F < dist_E_I) && VECTOR_NEQ(E,F) && VECTOR_NEQ(E,H)) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
    512   }
    513 
    514 
    515   // Pixel Tap Mapping: -|-|-|-|-
    516   //                    -|A|B|-|-
    517   //                    x|D|E|F|-
    518   //                    x|G|H|I|-
    519   //                    -|x|x|-|-
    520   if (!((VECTOR_EQ(D,E) && VECTOR_EQ(G,H)) || (VECTOR_EQ(D,G) && VECTOR_EQ(E,H))))
    521   {
    522     float dist_G_E = DistYCbCr(P(coord, -2,1)  , D) + DistYCbCr(D, B) + DistYCbCr(P(coord, -1,2), H) + DistYCbCr(H, F) + (4.0 * DistYCbCr(G, E));
    523     float dist_D_H = DistYCbCr(P(coord, -2,0)  , G) + DistYCbCr(G, P(coord, 0,2)) + DistYCbCr(A, E) + DistYCbCr(E, I) + (4.0 * DistYCbCr(D, H));
    524     bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_D_H) < dist_G_E;
    525     blendResult.w = ((dist_G_E > dist_D_H) && VECTOR_NEQ(E,D) && VECTOR_NEQ(E,H)) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
    526   }
    527 
    528   // Pixel Tap Mapping: -|-|x|x|-
    529   //                    -|A|B|C|x
    530   //                    -|D|E|F|x
    531   //                    -|-|H|I|-
    532   //                    -|-|-|-|-
    533   if (!((VECTOR_EQ(B,C) && VECTOR_EQ(E,F)) || (VECTOR_EQ(B,E) && VECTOR_EQ(C,F))))
    534   {
    535     float dist_E_C = DistYCbCr(D, B) + DistYCbCr(B, P(coord, 1,-2)) + DistYCbCr(H, F) + DistYCbCr(F, P(coord, 2,-1)) + (4.0 * DistYCbCr(E, C));
    536     float dist_B_F = DistYCbCr(A, E) + DistYCbCr(E, I) + DistYCbCr(P(coord, 0,-2), C) + DistYCbCr(C, P(coord, 2,0)) + (4.0 * DistYCbCr(B, F));
    537     bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_B_F) < dist_E_C;
    538     blendResult.y = ((dist_E_C > dist_B_F) && VECTOR_NEQ(E,B) && VECTOR_NEQ(E,F)) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
    539   }
    540 
    541   // Pixel Tap Mapping: -|x|x|-|-
    542   //                    x|A|B|C|-
    543   //                    x|D|E|F|-
    544   //                    -|G|H|-|-
    545   //                    -|-|-|-|-
    546   if (!((VECTOR_EQ(A,B) && VECTOR_EQ(D,E)) || (VECTOR_EQ(A,D) && VECTOR_EQ(B,E))))
    547   {
    548     float dist_D_B = DistYCbCr(P(coord, -2,0), A) + DistYCbCr(A, P(coord, 0,-2)) + DistYCbCr(G, E) + DistYCbCr(E, C) + (4.0 * DistYCbCr(D, B));
    549     float dist_A_E = DistYCbCr(P(coord, -2,-1), D) + DistYCbCr(D, H) + DistYCbCr(P(coord, -1,-2), B) + DistYCbCr(B, F) + (4.0 * DistYCbCr(A, E));
    550     bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_D_B) < dist_A_E;
    551     blendResult.x = ((dist_D_B < dist_A_E) && VECTOR_NEQ(E,D) && VECTOR_NEQ(E,B)) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
    552   }
    553 
    554   float4 res = E;
    555   float resW = Ew;
    556 
    557   // Pixel Tap Mapping: -|-|-|-|-
    558   //                    -|-|B|C|-
    559   //                    -|D|E|F|x
    560   //                    -|G|H|I|x
    561   //                    -|-|x|x|-
    562   if(blendResult.z != BLEND_NONE)
    563   {
    564     float dist_F_G = DistYCbCr(F, G);
    565     float dist_H_C = DistYCbCr(H, C);
    566     bool doLineBlend = (blendResult.z == BLEND_DOMINANT ||
    567                 !((blendResult.y != BLEND_NONE && !IsPixEqual(E, G)) || (blendResult.w != BLEND_NONE && !IsPixEqual(E, C)) ||
    568                   (IsPixEqual(G, H) && IsPixEqual(H, I) && IsPixEqual(I, F) && IsPixEqual(F, C) && !IsPixEqual(E, I))));
    569 
    570     float2 origin = float2(0.0, 1.0 / sqrt(2.0));
    571     float2 direction = float2(1.0, -1.0);
    572     if(doLineBlend)
    573     {
    574       bool haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_F_G <= dist_H_C) && VECTOR_NEQ(E,G) && VECTOR_NEQ(D,G);
    575       bool haveSteepLine = (STEEP_DIRECTION_THRESHOLD * dist_H_C <= dist_F_G) && VECTOR_NEQ(E,C) && VECTOR_NEQ(B,C);
    576       origin = haveShallowLine? float2(0.0, 0.25) : float2(0.0, 0.5);
    577       direction.x += haveShallowLine? 1.0: 0.0;
    578       direction.y -= haveSteepLine? 1.0: 0.0;
    579     }
    580 
    581     float4 blendPix = lerp(H,F, step(DistYCbCr(E, F), DistYCbCr(E, H)));
    582     float blendW = lerp(Hw,Fw, step(DistYCbCr(E, F), DistYCbCr(E, H)));
    583     res = lerp(res, blendPix, get_left_ratio(pos, origin, direction, scale));
    584     resW = lerp(resW, blendW, get_left_ratio(pos, origin, direction, scale));
    585   }
    586 
    587   // Pixel Tap Mapping: -|-|-|-|-
    588   //                    -|A|B|-|-
    589   //                    x|D|E|F|-
    590   //                    x|G|H|I|-
    591   //                    -|x|x|-|-
    592   if(blendResult.w != BLEND_NONE)
    593   {
    594     float dist_H_A = DistYCbCr(H, A);
    595     float dist_D_I = DistYCbCr(D, I);
    596     bool doLineBlend = (blendResult.w == BLEND_DOMINANT ||
    597                 !((blendResult.z != BLEND_NONE && !IsPixEqual(E, A)) || (blendResult.x != BLEND_NONE && !IsPixEqual(E, I)) ||
    598                   (IsPixEqual(A, D) && IsPixEqual(D, G) && IsPixEqual(G, H) && IsPixEqual(H, I) && !IsPixEqual(E, G))));
    599 
    600     float2 origin = float2(-1.0 / sqrt(2.0), 0.0);
    601     float2 direction = float2(1.0, 1.0);
    602     if(doLineBlend)
    603     {
    604       bool haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_H_A <= dist_D_I) && VECTOR_NEQ(E,A) && VECTOR_NEQ(B,A);
    605       bool haveSteepLine  = (STEEP_DIRECTION_THRESHOLD * dist_D_I <= dist_H_A) && VECTOR_NEQ(E,I) && VECTOR_NEQ(F,I);
    606       origin = haveShallowLine? float2(-0.25, 0.0) : float2(-0.5, 0.0);
    607       direction.y += haveShallowLine? 1.0: 0.0;
    608       direction.x += haveSteepLine? 1.0: 0.0;
    609     }
    610     origin = origin;
    611     direction = direction;
    612 
    613     float4 blendPix = lerp(H,D, step(DistYCbCr(E, D), DistYCbCr(E, H)));
    614     float blendW = lerp(Hw,Dw, step(DistYCbCr(E, D), DistYCbCr(E, H)));
    615     res = lerp(res, blendPix, get_left_ratio(pos, origin, direction, scale));
    616     resW = lerp(resW, blendW, get_left_ratio(pos, origin, direction, scale));
    617   }
    618 
    619   // Pixel Tap Mapping: -|-|x|x|-
    620   //                    -|A|B|C|x
    621   //                    -|D|E|F|x
    622   //                    -|-|H|I|-
    623   //                    -|-|-|-|-
    624   if(blendResult.y != BLEND_NONE)
    625   {
    626     float dist_B_I = DistYCbCr(B, I);
    627     float dist_F_A = DistYCbCr(F, A);
    628     bool doLineBlend = (blendResult.y == BLEND_DOMINANT ||
    629                 !((blendResult.x != BLEND_NONE && !IsPixEqual(E, I)) || (blendResult.z != BLEND_NONE && !IsPixEqual(E, A)) ||
    630                   (IsPixEqual(I, F) && IsPixEqual(F, C) && IsPixEqual(C, B) && IsPixEqual(B, A) && !IsPixEqual(E, C))));
    631 
    632     float2 origin = float2(1.0 / sqrt(2.0), 0.0);
    633     float2 direction = float2(-1.0, -1.0);
    634 
    635     if(doLineBlend)
    636     {
    637       bool haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_B_I <= dist_F_A) && VECTOR_NEQ(E,I) && VECTOR_NEQ(H,I);
    638       bool haveSteepLine  = (STEEP_DIRECTION_THRESHOLD * dist_F_A <= dist_B_I) && VECTOR_NEQ(E,A) && VECTOR_NEQ(D,A);
    639       origin = haveShallowLine? float2(0.25, 0.0) : float2(0.5, 0.0);
    640       direction.y -= haveShallowLine? 1.0: 0.0;
    641       direction.x -= haveSteepLine? 1.0: 0.0;
    642     }
    643 
    644     float4 blendPix = lerp(F,B, step(DistYCbCr(E, B), DistYCbCr(E, F)));
    645     float blendW = lerp(Fw,Bw, step(DistYCbCr(E, B), DistYCbCr(E, F)));
    646     res = lerp(res, blendPix, get_left_ratio(pos, origin, direction, scale));
    647     resW = lerp(resW, blendW, get_left_ratio(pos, origin, direction, scale));
    648   }
    649 
    650   // Pixel Tap Mapping: -|x|x|-|-
    651   //                    x|A|B|C|-
    652   //                    x|D|E|F|-
    653   //                    -|G|H|-|-
    654   //                    -|-|-|-|-
    655   if(blendResult.x != BLEND_NONE)
    656   {
    657     float dist_D_C = DistYCbCr(D, C);
    658     float dist_B_G = DistYCbCr(B, G);
    659     bool doLineBlend = (blendResult.x == BLEND_DOMINANT ||
    660                 !((blendResult.w != BLEND_NONE && !IsPixEqual(E, C)) || (blendResult.y != BLEND_NONE && !IsPixEqual(E, G)) ||
    661                   (IsPixEqual(C, B) && IsPixEqual(B, A) && IsPixEqual(A, D) && IsPixEqual(D, G) && !IsPixEqual(E, A))));
    662 
    663     float2 origin = float2(0.0, -1.0 / sqrt(2.0));
    664     float2 direction = float2(-1.0, 1.0);
    665     if(doLineBlend)
    666     {
    667       bool haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_D_C <= dist_B_G) && VECTOR_NEQ(E,C) && VECTOR_NEQ(F,C);
    668       bool haveSteepLine  = (STEEP_DIRECTION_THRESHOLD * dist_B_G <= dist_D_C) && VECTOR_NEQ(E,G) && VECTOR_NEQ(H,G);
    669       origin = haveShallowLine? float2(0.0, -0.25) : float2(0.0, -0.5);
    670       direction.x -= haveShallowLine? 1.0: 0.0;
    671       direction.y += haveSteepLine? 1.0: 0.0;
    672     }
    673 
    674     float4 blendPix = lerp(D,B, step(DistYCbCr(E, B), DistYCbCr(E, D)));
    675     float blendW = lerp(Dw,Bw, step(DistYCbCr(E, B), DistYCbCr(E, D)));
    676     res = lerp(res, blendPix, get_left_ratio(pos, origin, direction, scale));
    677     resW = lerp(resW, blendW, get_left_ratio(pos, origin, direction, scale));
    678   }
    679 
    680   ialpha = res.w;
    681   texcol = float4(res.xyz, resW);
    682 
    683   // Compensate for partially transparent sampling.
    684   if (ialpha > 0.0)
    685     texcol.rgb /= float3(ialpha, ialpha, ialpha);
    686 
    687 #if BINALPHA
    688   ialpha = (ialpha >= 0.5) ? 1.0 : 0.0;
    689 #endif
    690 }
    691 
    692 #undef P
    693 
    694 )";
    695   }
    696 }
    697 
    698 std::string GPU_HW_ShaderGen::GenerateBatchFragmentShader(
    699   GPU_HW::BatchRenderMode render_mode, GPUTransparencyMode transparency, GPU_HW::BatchTextureMode texture_mode,
    700   GPUTextureFilter texture_filtering, bool uv_limits, bool force_round_texcoords, bool dithering, bool interlacing,
    701   bool check_mask, bool use_rov, bool use_rov_depth, bool rov_depth_test)
    702 {
          // Builds and returns the source text of the batch fragment shader. Each argument, together with several
          // m_* members, is turned into a preprocessor macro below, and the shader text branches on those macros
          // with #if, so every distinct option combination produces different shader text.

    703   // TODO: don't write depth for shader blend
          // Debug-only sanity checks: a transparency mode other than Disabled is only expected with ShaderBlend,
          // and ROV depth testing requires both use_rov and use_rov_depth.
    704   DebugAssert(transparency == GPUTransparencyMode::Disabled || render_mode == GPU_HW::BatchRenderMode::ShaderBlend);
    705   DebugAssert(!rov_depth_test || (use_rov && use_rov_depth));
    706 
          // Flags derived from the arguments; they feed the macros and the entry point declaration further down.
    707   const bool textured = (texture_mode != GPU_HW::BatchTextureMode::Disabled);
    708   const bool palette =
    709     (texture_mode == GPU_HW::BatchTextureMode::Palette4Bit || texture_mode == GPU_HW::BatchTextureMode::Palette8Bit);
    710   const bool shader_blending = (render_mode == GPU_HW::BatchRenderMode::ShaderBlend);
          // Dual-source output is selected only when neither ShaderBlend nor ROV is in use,
          // m_supports_dual_source_blend is set, and either the render mode is neither TransparencyDisabled nor
          // OnlyOpaque, or texture filtering is something other than Nearest.
    711   const bool use_dual_source = (!shader_blending && !use_rov && m_supports_dual_source_blend &&
    712                                 ((render_mode != GPU_HW::BatchRenderMode::TransparencyDisabled &&
    713                                   render_mode != GPU_HW::BatchRenderMode::OnlyOpaque) ||
    714                                  texture_filtering != GPUTextureFilter::Nearest));
    715 
          // Header first, then one macro per option. The order of these calls is the order the macros appear in
          // the generated text.
    716   std::stringstream ss;
    717   WriteHeader(ss, use_rov);
    718   DefineMacro(ss, "TRANSPARENCY", render_mode != GPU_HW::BatchRenderMode::TransparencyDisabled);
    719   DefineMacro(ss, "TRANSPARENCY_ONLY_OPAQUE", render_mode == GPU_HW::BatchRenderMode::OnlyOpaque);
    720   DefineMacro(ss, "TRANSPARENCY_ONLY_TRANSPARENT", render_mode == GPU_HW::BatchRenderMode::OnlyTransparent);
    721   DefineMacro(ss, "TRANSPARENCY_MODE", static_cast<s32>(transparency));
    722   DefineMacro(ss, "SHADER_BLENDING", shader_blending);
    723   DefineMacro(ss, "CHECK_MASK_BIT", check_mask);
    724   DefineMacro(ss, "TEXTURED", textured);
    725   DefineMacro(ss, "PALETTE", palette);
    726   DefineMacro(ss, "PALETTE_4_BIT", texture_mode == GPU_HW::BatchTextureMode::Palette4Bit);
    727   DefineMacro(ss, "PALETTE_8_BIT", texture_mode == GPU_HW::BatchTextureMode::Palette8Bit);
    728   DefineMacro(ss, "DITHERING", dithering);
    729   DefineMacro(ss, "DITHERING_SCALED", m_scaled_dithering);
    730   // Debanding requires true color to work correctly.
    731   DefineMacro(ss, "DEBANDING", m_true_color && m_debanding);
    732   DefineMacro(ss, "INTERLACING", interlacing);
    733   DefineMacro(ss, "TRUE_COLOR", m_true_color);
    734   DefineMacro(ss, "TEXTURE_FILTERING", texture_filtering != GPUTextureFilter::Nearest);
    735   DefineMacro(ss, "UV_LIMITS", uv_limits);
    736   DefineMacro(ss, "USE_ROV", use_rov);
    737   DefineMacro(ss, "USE_ROV_DEPTH", use_rov_depth);
    738   DefineMacro(ss, "ROV_DEPTH_TEST", rov_depth_test);
    739   DefineMacro(ss, "USE_DUAL_SOURCE", use_dual_source);
    740   DefineMacro(ss, "WRITE_MASK_AS_DEPTH", m_write_mask_as_depth);
    741   DefineMacro(ss, "FORCE_ROUND_TEXCOORDS", force_round_texcoords);
    742   DefineMacro(ss, "UPSCALED", m_resolution_scale > 1);
    743 
          // Shared declarations written by the helper functions, followed by the texture the shader reads as samp0.
    744   WriteCommonFunctions(ss);
    745   WriteBatchUniformBuffer(ss);
    746   DeclareTexture(ss, "samp0", 0);
    747 
          // ROV path: always declare the colour image; declare the depth image only when use_rov_depth is set.
    748   if (use_rov)
    749   {
    750     DeclareImage(ss, "rov_color", 0);
    751     if (use_rov_depth)
    752       DeclareImage(ss, "rov_depth", 1, true);
    753   }
    754 
          // Emit DITHER_MATRIX flattened row by row into a 16-entry constant array named s_dither_values.
          // The array declaration/initializer syntax differs depending on m_glsl.
    755   if (m_glsl)
    756     ss << "CONSTANT int[16] s_dither_values = int[16]( ";
    757   else
    758     ss << "CONSTANT int s_dither_values[] = {";
    759   for (u32 i = 0; i < 16; i++)
    760   {
    761     if (i > 0)
    762       ss << ", ";
    763     ss << DITHER_MATRIX[i / 4][i % 4];
    764   }
    765   if (m_glsl)
    766     ss << " );\n";
    767   else
    768     ss << "};\n";
    769 
          // Shader-side helpers: ApplyDithering, the TEXTURED-only helpers (ApplyTextureWindow,
          // FloatToIntegerCoords, SampleFromVRAM), and ApplyDebanding.
    770   ss << R"(
    771 uint3 ApplyDithering(uint2 coord, uint3 icol)
    772 {
    773   #if DITHERING_SCALED
    774     uint2 fc = coord & uint2(3u, 3u);
    775   #else
    776     uint2 fc = (coord / uint2(RESOLUTION_SCALE, RESOLUTION_SCALE)) & uint2(3u, 3u);
    777   #endif
    778   int offset = s_dither_values[fc.y * 4u + fc.x];
    779 
    780   #if !TRUE_COLOR
    781     return uint3(clamp((int3(icol) + int3(offset, offset, offset)) >> 3, 0, 31));
    782   #else
    783     return uint3(clamp(int3(icol) + int3(offset, offset, offset), 0, 255));
    784   #endif
    785 }
    786 
    787 #if TEXTURED
    788 CONSTANT float4 TRANSPARENT_PIXEL_COLOR = float4(0.0, 0.0, 0.0, 0.0);
    789 
    790 #if PALETTE
    791   #define TEXPAGE_VALUE uint4
    792 #else
    793   #define TEXPAGE_VALUE uint2
    794 #endif
    795 
    796 uint2 ApplyTextureWindow(uint2 coords)
    797 {
    798   uint x = (uint(coords.x) & u_texture_window_and.x) | u_texture_window_or.x;
    799   uint y = (uint(coords.y) & u_texture_window_and.y) | u_texture_window_or.y;
    800   return uint2(x, y);
    801 }
    802 
    803 uint2 FloatToIntegerCoords(float2 coords)
    804 {
    805   // With the vertex offset applied at 1x resolution scale, we want to round the texture coordinates.
    806   // Floor them otherwise, as it currently breaks when upscaling as the vertex offset is not applied.
    807   return uint2((RESOLUTION_SCALE == 1u || FORCE_ROUND_TEXCOORDS != 0) ? roundEven(coords) : floor(coords));
    808 }
    809 
    810 float4 SampleFromVRAM(TEXPAGE_VALUE texpage, float2 coords)
    811 {
    812   #if PALETTE
    813     uint2 icoord = ApplyTextureWindow(FloatToIntegerCoords(coords));
    814 
    815     uint2 vicoord;
    816     #if PALETTE_4_BIT
    817       // 4bit will never wrap, since it's in the last texpage row.
    818       vicoord = uint2(texpage.x + (icoord.x / 4u), texpage.y + icoord.y);
    819     #elif PALETTE_8_BIT
    820       // 8bit can wrap in the X direction.
    821       vicoord = uint2((texpage.x + (icoord.x / 2u)) & 0x3FFu, texpage.y + icoord.y);
    822     #endif
    823 
    824     // load colour/palette
    825     float4 texel = LOAD_TEXTURE(samp0, int2(vicoord * RESOLUTION_SCALE), 0);
    826     uint vram_value = RGBA8ToRGBA5551(texel);
    827 
    828     // apply palette
    829     #if PALETTE_4_BIT
    830       uint subpixel = icoord.x & 3u;
    831       uint palette_index = (vram_value >> (subpixel * 4u)) & 0x0Fu;
    832       uint2 palette_icoord = uint2((texpage.z + palette_index), texpage.w);
    833     #elif PALETTE_8_BIT
    834       // can only wrap in X direction for 8-bit, 4-bit will fit in texpage size.
    835       uint subpixel = icoord.x & 1u;
    836       uint palette_index = (vram_value >> (subpixel * 8u)) & 0xFFu;
    837       uint2 palette_icoord = uint2(((texpage.z + palette_index) & 0x3FFu), texpage.w);
    838     #endif
    839 
    840     return LOAD_TEXTURE(samp0, int2(palette_icoord * RESOLUTION_SCALE), 0);
    841   #else
    842     // Direct texturing - usually render-to-texture effects.
    843     uint2 vicoord;
    844     #if !UPSCALED
    845       uint2 icoord = ApplyTextureWindow(FloatToIntegerCoords(coords));
    846       vicoord = (texpage.xy + icoord) & uint2(1023, 511);
    847     #else
    848       // Coordinates are already upscaled, we need to downscale them to apply the texture
    849       // window, then re-upscale/offset. We can't round here, because it could result in
    850       // going outside of the texture window.
    851       float2 ncoords = coords / float(RESOLUTION_SCALE);
    852       float2 nfpart = frac(ncoords);
    853       uint2 nicoord = ApplyTextureWindow(uint2(floor(ncoords)));
    854       uint2 nvicoord = (texpage.xy + nicoord) & uint2(1023, 511);
    855       coords = (float2(nvicoord) + nfpart) * float(RESOLUTION_SCALE);
    856       vicoord = uint2(floor(coords));
    857     #endif
    858 
    859     return LOAD_TEXTURE(samp0, int2(vicoord), 0);
    860   #endif
    861 }
    862 
    863 #endif
    864 
    865 // From https://alex.vlachos.com/graphics/Alex_Vlachos_Advanced_VR_Rendering_GDC2015.pdf
    866 // and https://www.shadertoy.com/view/MslGR8 (5th one starting from the bottom)
    867 // NOTE: `frag_coord` is in pixels (i.e. not normalized UV).
    868 float3 ApplyDebanding(float2 frag_coord)
    869 {
    870 #if DEBANDING
    871   // Iestyn's RGB dither (7 asm instructions) from Portal 2 X360, slightly modified for VR.
    872   float ditherc = dot(vec2(171.0, 231.0), frag_coord);
    873   float3 dither = float3(ditherc, ditherc, ditherc);
    874   dither = fract(dither / float3(103.0, 71.0, 97.0));
    875 
    876   // Subtract 0.5 to avoid slightly brightening the whole viewport.
    877   return (dither - 0.5) / 255.0;
    878 #else
    879   return float3(0.0, 0.0, 0.0);
    880 #endif
    881 }
    882 )";
    883 
          // Number of colour outputs requested from the entry point: none when writing through ROV, two when
          // dual-source output is used, otherwise one.
    884   const u32 num_fragment_outputs = use_rov ? 0 : (use_dual_source ? 2 : 1);
          // Textured batches optionally get the texture filter function and declare a v_texpage input (uint4 for
          // the palette modes, uint2 otherwise), plus v_uv_limits when uv_limits is set. Untextured batches
          // declare no extra inputs. The remaining DeclareFragmentEntryPoint arguments are the same in all three calls.
    885   if (textured)
    886   {
    887     if (texture_filtering != GPUTextureFilter::Nearest)
    888       WriteBatchTextureFilter(ss, texture_filtering);
    889 
    890     if (uv_limits)
    891     {
    892       DeclareFragmentEntryPoint(ss, 1, 1,
    893                                 {{"nointerpolation", palette ? "uint4 v_texpage" : "uint2 v_texpage"},
    894                                  {"nointerpolation", "float4 v_uv_limits"}},
    895                                 true, num_fragment_outputs, use_dual_source, m_write_mask_as_depth, UsingMSAA(),
    896                                 UsingPerSampleShading(), false, m_disable_color_perspective,
    897                                 shader_blending && !use_rov, use_rov);
    898     }
    899     else
    900     {
    901       DeclareFragmentEntryPoint(ss, 1, 1, {{"nointerpolation", palette ? "uint4 v_texpage" : "uint2 v_texpage"}}, true,
    902                                 num_fragment_outputs, use_dual_source, m_write_mask_as_depth, UsingMSAA(),
    903                                 UsingPerSampleShading(), false, m_disable_color_perspective,
    904                                 shader_blending && !use_rov, use_rov);
    905     }
    906   }
    907   else
    908   {
    909     DeclareFragmentEntryPoint(ss, 1, 0, {}, true, num_fragment_outputs, use_dual_source, m_write_mask_as_depth,
    910                               UsingMSAA(), UsingPerSampleShading(), false, m_disable_color_perspective,
    911                               shader_blending && !use_rov, use_rov);
    912   }
    913 
          // Entry point body. It computes the integer colour (icolor), the filter alpha (ialpha) and the mask
          // bit (oalpha), then either blends against the existing colour inside the shader (SHADER_BLENDING,
          // storing through ROV when USE_ROV is set) or writes o_col0 / o_col1 / o_depth directly.
    914   ss << R"(
    915 {
    916   uint3 vertcol = uint3(v_col0.rgb * float3(255.0, 255.0, 255.0) + ApplyDebanding(v_pos.xy));
    917   uint2 fragpos = uint2(v_pos.xy);
    918 
    919   bool semitransparent;
    920   uint3 icolor;
    921   float ialpha;
    922   float oalpha;
    923 
    924   #if INTERLACING
    925     if ((fragpos.y & 1u) == u_interlaced_displayed_field)
    926       discard;
    927   #endif
    928 
    929   #if TEXTURED
    930     float4 texcol;
    931     #if TEXTURE_FILTERING
    932       FilteredSampleFromVRAM(v_texpage, v_tex0, v_uv_limits, texcol, ialpha);
    933       if (ialpha < 0.5)
    934         discard;
    935     #else
    936       #if UV_LIMITS
    937         texcol = SampleFromVRAM(v_texpage, clamp(v_tex0, v_uv_limits.xy, v_uv_limits.zw));
    938       #else
    939         texcol = SampleFromVRAM(v_texpage, v_tex0);
    940       #endif
    941       if (VECTOR_EQ(texcol, TRANSPARENT_PIXEL_COLOR))
    942         discard;
    943 
    944       ialpha = 1.0;
    945     #endif
    946 
    947     semitransparent = (texcol.a >= 0.5);
    948 
    949     // If not using true color, truncate the framebuffer colors to 5-bit.
    950     #if !TRUE_COLOR
    951       icolor = uint3(texcol.rgb * float3(255.0, 255.0, 255.0)) >> 3;
    952       icolor = (icolor * vertcol) >> 4;
    953       #if DITHERING
    954         icolor = ApplyDithering(fragpos, icolor);
    955       #else
    956         icolor = min(icolor >> 3, uint3(31u, 31u, 31u));
    957       #endif
    958     #else
    959       icolor = uint3(texcol.rgb * float3(255.0, 255.0, 255.0) + ApplyDebanding(v_pos.xy));
    960       icolor = (icolor * vertcol) >> 7;
    961       #if DITHERING
    962         icolor = ApplyDithering(fragpos, icolor);
    963       #else
    964         icolor = min(icolor, uint3(255u, 255u, 255u));
    965       #endif
    966     #endif
    967 
    968     // Compute output alpha (mask bit)
    969     oalpha = float(u_set_mask_while_drawing ? 1 : int(semitransparent));
    970   #else
    971     // All pixels are semitransparent for untextured polygons.
    972     semitransparent = true;
    973     icolor = vertcol;
    974     ialpha = 1.0;
    975 
    976     #if DITHERING
    977       icolor = ApplyDithering(fragpos, icolor);
    978     #else
    979       #if !TRUE_COLOR
    980         icolor >>= 3;
    981       #endif
    982     #endif
    983 
    984     // However, the mask bit is cleared if set mask bit is false.
    985     oalpha = float(u_set_mask_while_drawing);
    986   #endif
    987 
    988   #if SHADER_BLENDING
    989     #if USE_ROV
    990       BEGIN_ROV_REGION;
    991       float4 bg_col = ROV_LOAD(rov_color, fragpos);
    992       float4 o_col0;
    993       bool discarded = false;
    994 
    995       #if ROV_DEPTH_TEST
    996         float bg_depth = ROV_LOAD(rov_depth, fragpos).r;
    997         discarded = (v_pos.z > bg_depth);
    998       #endif
    999       #if CHECK_MASK_BIT
   1000         discarded = discarded || (bg_col.a != 0.0);
   1001       #endif        
   1002     #else
   1003       float4 bg_col = LAST_FRAG_COLOR;
   1004       #if CHECK_MASK_BIT
   1005         if (bg_col.a != 0.0)
   1006           discard;
   1007       #endif
   1008     #endif
   1009 
   1010     // Work in normalized space for true colour, matches HW blend.
   1011     float4 fg_col = float4(float3(icolor), oalpha);
   1012     #if TRUE_COLOR
   1013       fg_col.rgb /= 255.0;
   1014     #elif TRANSPARENCY // rgb not used in check-mask only
   1015       bg_col.rgb = roundEven(bg_col.rgb * 31.0);
   1016     #endif
   1017 
   1018     #if TEXTURE_FILTERING
   1019       #if TRANSPARENCY_MODE == 0 || TRANSPARENCY_MODE == 3
   1020         bg_col.rgb /= ialpha;
   1021       #endif
   1022       fg_col.rgb *= ialpha;
   1023     #endif
   1024 
   1025     o_col0.a = fg_col.a;
   1026     #if TRANSPARENCY_MODE == 0 // Half BG + Half FG.
   1027       o_col0.rgb = (bg_col.rgb * 0.5) + (fg_col.rgb * 0.5);
   1028     #elif TRANSPARENCY_MODE == 1 // BG + FG
   1029       o_col0.rgb = bg_col.rgb + fg_col.rgb;
   1030     #elif TRANSPARENCY_MODE == 2 // BG - FG
   1031       o_col0.rgb = bg_col.rgb - fg_col.rgb;
   1032     #elif TRANSPARENCY_MODE == 3 // BG + 1/4 FG.
   1033       o_col0.rgb = bg_col.rgb + (fg_col.rgb * 0.25);
   1034     #else
   1035       o_col0.rgb = fg_col.rgb;
   1036     #endif
   1037 
   1038     // 16-bit truncation.
   1039     #if !TRUE_COLOR && TRANSPARENCY
   1040       o_col0.rgb = floor(o_col0.rgb);
   1041     #endif
   1042 
   1043     #if TRANSPARENCY
   1044       // If pixel isn't marked as semitransparent, replace with previous colour.
   1045       o_col0 = semitransparent ? o_col0 : fg_col;
   1046     #endif
   1047 
   1048     // Normalize for non-true-color.
   1049     #if !TRUE_COLOR
   1050       o_col0.rgb /= 31.0;
   1051     #endif
   1052 
   1053     #if USE_ROV
   1054       if (!discarded)
   1055       {
   1056         ROV_STORE(rov_color, fragpos, o_col0);
   1057         #if USE_ROV_DEPTH
   1058           ROV_STORE(rov_depth, fragpos, float4(v_pos.z, 0.0, 0.0, 0.0));
   1059         #endif
   1060       }
   1061       END_ROV_REGION;
   1062     #endif
   1063   #else
   1064     // Premultiply alpha so we don't need to use a colour output for it.
   1065     float premultiply_alpha = ialpha;
   1066     #if TRANSPARENCY
   1067       premultiply_alpha = ialpha * (semitransparent ? u_src_alpha_factor : 1.0);
   1068     #endif
   1069 
   1070     float3 color;
   1071     #if !TRUE_COLOR
   1072       // We want to apply the alpha before the truncation to 16-bit, otherwise we'll be passing a 32-bit precision color
   1073       // into the blend unit, which can cause a small amount of error to accumulate.
   1074       color = floor(float3(icolor) * premultiply_alpha) / 31.0;
   1075     #else
   1076       // True color is actually simpler here since we want to preserve the precision.
   1077       color = (float3(icolor) * premultiply_alpha) / 255.0;
   1078     #endif
   1079 
   1080     #if TRANSPARENCY && TEXTURED
   1081       // Apply semitransparency. If not a semitransparent texel, destination alpha is ignored.
   1082       if (semitransparent)
   1083       {
   1084         #if USE_DUAL_SOURCE
   1085           o_col0 = float4(color, oalpha);
   1086           o_col1 = float4(0.0, 0.0, 0.0, u_dst_alpha_factor / ialpha);
   1087         #else
   1088           o_col0 = float4(color, oalpha);
   1089         #endif
   1090 
   1091         #if WRITE_MASK_AS_DEPTH
   1092           o_depth = oalpha * v_pos.z;
   1093         #endif
   1094 
   1095         #if TRANSPARENCY_ONLY_OPAQUE
   1096           discard;
   1097         #endif
   1098       }
   1099       else
   1100       {
   1101         #if USE_DUAL_SOURCE
   1102           o_col0 = float4(color, oalpha);
   1103           o_col1 = float4(0.0, 0.0, 0.0, 1.0 - ialpha);
   1104         #else
   1105           o_col0 = float4(color, oalpha);
   1106         #endif
   1107 
   1108         #if WRITE_MASK_AS_DEPTH
   1109           o_depth = oalpha * v_pos.z;
   1110         #endif
   1111 
   1112         #if TRANSPARENCY_ONLY_TRANSPARENT
   1113           discard;
   1114         #endif
   1115       }
   1116     #elif TRANSPARENCY
   1117       // We shouldn't be rendering opaque geometry only when untextured, so no need to test/discard here.
   1118       #if USE_DUAL_SOURCE
   1119         o_col0 = float4(color, oalpha);
   1120         o_col1 = float4(0.0, 0.0, 0.0, u_dst_alpha_factor / ialpha);
   1121       #else
   1122         o_col0 = float4(color, oalpha);
   1123       #endif
   1124 
   1125       #if WRITE_MASK_AS_DEPTH
   1126         o_depth = oalpha * v_pos.z;
   1127       #endif
   1128     #else
   1129       // Non-transparency won't enable blending so we can write the mask here regardless.
   1130       o_col0 = float4(color, oalpha);
   1131 
   1132       #if USE_DUAL_SOURCE
   1133         o_col1 = float4(0.0, 0.0, 0.0, 1.0 - ialpha);
   1134       #endif
   1135 
   1136       #if WRITE_MASK_AS_DEPTH
   1137         o_depth = oalpha * v_pos.z;
   1138       #endif
   1139     #endif
   1140   #endif
   1141 }
   1142 )";
   1143 
   1144   return ss.str();
   1145 }
   1146 
   1147 std::string GPU_HW_ShaderGen::GenerateVRAMExtractFragmentShader(bool color_24bit, bool depth_buffer)
   1148 {
          // Builds and returns the source text of a fragment shader that reads the VRAM texture (samp0) and
          // writes the result to o_col0. With color_24bit set, pairs of 16-bit texels are unpacked as 24-bit
          // colour (SampleVRAM24); otherwise the texel's rgb is copied. With depth_buffer set, a second texture
          // (samp1) is read and its red channel is written to o_col1.
   1149   std::stringstream ss;
   1150   WriteHeader(ss);
   1151   DefineMacro(ss, "COLOR_24BIT", color_24bit);
   1152   DefineMacro(ss, "DEPTH_BUFFER", depth_buffer);
          // NOTE(review): the shader text in this function tests MULTISAMPLING (defined by WriteCommonFunctions),
          // not MULTISAMPLED. Confirm whether this macro is still needed.
   1153   DefineMacro(ss, "MULTISAMPLED", UsingMSAA());
   1154 
          // Uniforms consumed by the shader text below, then the source texture(s).
   1155   WriteCommonFunctions(ss);
   1156   DeclareUniformBuffer(ss, {"uint2 u_vram_offset", "uint u_skip_x", "uint u_line_skip"}, true);
   1157   DeclareTexture(ss, "samp0", 0, UsingMSAA());
   1158   if (depth_buffer)
   1159     DeclareTexture(ss, "samp1", 1, UsingMSAA());
   1160 
          // Helpers: LoadVRAM / LoadDepth average all samples when MULTISAMPLING is set, and SampleVRAM24 combines
          // two adjacent 16-bit texels and extracts three 8-bit components from them.
   1161   ss << R"(
   1162 float4 LoadVRAM(int2 coords)
   1163 {
   1164 #if MULTISAMPLING
   1165   float4 value = LOAD_TEXTURE_MS(samp0, coords, 0u);
   1166   FOR_UNROLL (uint sample_index = 1u; sample_index < MULTISAMPLES; sample_index++)
   1167     value += LOAD_TEXTURE_MS(samp0, coords, sample_index);
   1168   value /= float(MULTISAMPLES);
   1169   return value;
   1170 #else
   1171   return LOAD_TEXTURE(samp0, coords, 0);
   1172 #endif
   1173 }
   1174 
   1175 #if DEPTH_BUFFER
   1176 float LoadDepth(int2 coords)
   1177 {
   1178   // Need to duplicate because different types in different languages...
   1179 #if MULTISAMPLING
   1180   float value = LOAD_TEXTURE_MS(samp1, coords, 0u).r;
   1181   FOR_UNROLL (uint sample_index = 1u; sample_index < MULTISAMPLES; sample_index++)
   1182     value += LOAD_TEXTURE_MS(samp1, coords, sample_index).r;
   1183   value /= float(MULTISAMPLES);
   1184   return value;
   1185 #else
   1186   return LOAD_TEXTURE(samp1, coords, 0).r;
   1187 #endif
   1188 }
   1189 #endif
   1190 
   1191 float3 SampleVRAM24(uint2 icoords)
   1192 {
   1193   // load adjacent 16-bit texels
   1194   uint2 clamp_size = uint2(1024, 512);
   1195 
   1196   // relative to start of scanout
   1197   uint2 vram_coords = u_vram_offset + uint2((icoords.x * 3u) / 2u, icoords.y);
   1198   uint s0 = RGBA8ToRGBA5551(LoadVRAM(int2((vram_coords % clamp_size) * RESOLUTION_SCALE)));
   1199   uint s1 = RGBA8ToRGBA5551(LoadVRAM(int2(((vram_coords + uint2(1, 0)) % clamp_size) * RESOLUTION_SCALE)));
   1200 
   1201   // select which part of the combined 16-bit texels we are currently shading
   1202   uint s1s0 = ((s1 << 16) | s0) >> ((icoords.x & 1u) * 8u);
   1203 
   1204   // extract components and normalize
   1205   return float3(float(s1s0 & 0xFFu) / 255.0, float((s1s0 >> 8u) & 0xFFu) / 255.0,
   1206                 float((s1s0 >> 16u) & 0xFFu) / 255.0);
   1207 }
   1208 )";
   1209 
          // Two outputs (o_col0 and o_col1) are requested when the depth buffer is extracted as well, otherwise one.
   1210   DeclareFragmentEntryPoint(ss, 0, 1, {}, true, depth_buffer ? 2 : 1);
   1211   ss << R"(
   1212 {
   1213   uint2 icoords = uint2(uint(v_pos.x) + u_skip_x, uint(v_pos.y) << u_line_skip);
   1214   int2 wrapped_coords = int2((icoords + u_vram_offset) % VRAM_SIZE);
   1215 
   1216   #if COLOR_24BIT
   1217     o_col0 = float4(SampleVRAM24(icoords), 1.0);
   1218   #else
   1219     o_col0 = float4(LoadVRAM(wrapped_coords).rgb, 1.0);
   1220   #endif
   1221 
   1222   #if DEPTH_BUFFER
   1223     o_col1 = float4(LoadDepth(wrapped_coords), 0.0, 0.0, 0.0);
   1224   #endif
   1225 }
   1226 )";
   1227 
   1228   return ss.str();
   1229 }
   1230 
   1231 std::string GPU_HW_ShaderGen::GenerateWireframeGeometryShader()
   1232 {
          // Builds and returns the source text of a geometry shader that takes one triangle and emits its three
          // edges as separate two-vertex line strips (0-1, 1-2, 2-0), six vertices in total. Only the position
          // is forwarded to the output. The text comes in two variants, chosen by m_glsl.
   1233   std::stringstream ss;
   1234   WriteHeader(ss);
   1235   WriteCommonFunctions(ss);
   1236 
   1237   if (m_glsl)
   1238   {
            // GLSL variant: positions are copied from gl_in[] and each edge is closed with EndPrimitive().
   1239     ss << R"(
   1240 layout(triangles) in;
   1241 layout(line_strip, max_vertices = 6) out;
   1242 
   1243 void main()
   1244 {
   1245   gl_Position = gl_in[0].gl_Position;
   1246   EmitVertex();
   1247   gl_Position = gl_in[1].gl_Position;
   1248   EmitVertex();
   1249   EndPrimitive();
   1250   gl_Position = gl_in[1].gl_Position;
   1251   EmitVertex();
   1252   gl_Position = gl_in[2].gl_Position;
   1253   EmitVertex();
   1254   EndPrimitive();
   1255   gl_Position = gl_in[2].gl_Position;
   1256   EmitVertex();
   1257   gl_Position = gl_in[0].gl_Position;
   1258   EmitVertex();
   1259   EndPrimitive();
   1260 }
   1261 )";
   1262   }
   1263   else
   1264   {
            // Non-GLSL variant: the input struct carries col0 and pos, but GetVertex() copies only pos, and each
            // edge is closed with RestartStrip().
   1265     ss << R"(
   1266 struct GSInput
   1267 {
   1268   float4 col0 : COLOR0;
   1269   float4 pos : SV_Position;
   1270 };
   1271 
   1272 struct GSOutput
   1273 {
   1274   float4 pos : SV_Position;
   1275 };
   1276 
   1277 GSOutput GetVertex(GSInput vi)
   1278 {
   1279   GSOutput vo;
   1280   vo.pos = vi.pos;
   1281   return vo;
   1282 }
   1283 
   1284 [maxvertexcount(6)]
   1285 void main(triangle GSInput input[3], inout LineStream<GSOutput> output)
   1286 {
   1287   output.Append(GetVertex(input[0]));
   1288   output.Append(GetVertex(input[1]));
   1289   output.RestartStrip();
   1290 
   1291   output.Append(GetVertex(input[1]));
   1292   output.Append(GetVertex(input[2]));
   1293   output.RestartStrip();
   1294 
   1295   output.Append(GetVertex(input[2]));
   1296   output.Append(GetVertex(input[0]));
   1297   output.RestartStrip();
   1298 }
   1299 )";
   1300   }
   1301 
   1302   return ss.str();
   1303 }
   1304 
   1305 std::string GPU_HW_ShaderGen::GenerateWireframeFragmentShader()
   1306 {
          // Builds and returns the source text of a fragment shader whose only statement writes the constant
          // float4(1.0, 1.0, 1.0, 0.5) to o_col0.
   1307   std::stringstream ss;
   1308   WriteHeader(ss);
   1309   WriteCommonFunctions(ss);
   1310 
   1311   DeclareFragmentEntryPoint(ss, 0, 0);
   1312   ss << R"(
   1313 {
   1314   o_col0 = float4(1.0, 1.0, 1.0, 0.5);
   1315 }
   1316 )";
   1317 
   1318   return ss.str();
   1319 }
   1320 
   1321 std::string GPU_HW_ShaderGen::GenerateVRAMReadFragmentShader()
   1322 {
          // Builds and returns the source text of a fragment shader that packs two horizontally adjacent 16-bit
          // VRAM pixels into each RGBA output pixel: the left pixel's low/high bytes go to r/g and the right
          // pixel's to b/a. Coordinates are offset by u_base_coords.
   1323   std::stringstream ss;
   1324   WriteHeader(ss);
   1325   WriteCommonFunctions(ss);
   1326   DeclareUniformBuffer(ss, {"uint2 u_base_coords", "uint2 u_size"}, true);
   1327 
   1328   DeclareTexture(ss, "samp0", 0, UsingMSAA());
   1329 
          // Helpers: LoadVRAM averages all samples when MULTISAMPLING is set. SampleVRAM converts one texel
          // directly when RESOLUTION_SCALE is 1, and otherwise averages the RESOLUTION_SCALE x RESOLUTION_SCALE
          // block of texels before converting.
   1330   ss << R"(
   1331 float4 LoadVRAM(int2 coords)
   1332 {
   1333 #if MULTISAMPLING
   1334   float4 value = LOAD_TEXTURE_MS(samp0, coords, 0u);
   1335   FOR_UNROLL (uint sample_index = 1u; sample_index < MULTISAMPLES; sample_index++)
   1336     value += LOAD_TEXTURE_MS(samp0, coords, sample_index);
   1337   value /= float(MULTISAMPLES);
   1338   return value;
   1339 #else
   1340   return LOAD_TEXTURE(samp0, coords, 0);
   1341 #endif
   1342 }
   1343 
   1344 uint SampleVRAM(uint2 coords)
   1345 {
   1346   if (RESOLUTION_SCALE == 1u)
   1347     return RGBA8ToRGBA5551(LoadVRAM(int2(coords)));
   1348 
   1349   // Box filter for downsampling.
   1350   float4 value = float4(0.0, 0.0, 0.0, 0.0);
   1351   uint2 base_coords = coords * uint2(RESOLUTION_SCALE, RESOLUTION_SCALE);
   1352   for (uint offset_x = 0u; offset_x < RESOLUTION_SCALE; offset_x++)
   1353   {
   1354     for (uint offset_y = 0u; offset_y < RESOLUTION_SCALE; offset_y++)
   1355       value += LoadVRAM(int2(base_coords + uint2(offset_x, offset_y)));
   1356   }
   1357   value /= float(RESOLUTION_SCALE * RESOLUTION_SCALE);
   1358   return RGBA8ToRGBA5551(value);
   1359 }
   1360 )";
   1361 
   1362   DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1);
   1363   ss << R"(
   1364 {
   1365   uint2 sample_coords = uint2(uint(v_pos.x) * 2u, uint(v_pos.y));
   1366   sample_coords += u_base_coords;
   1367 
   1368   // We're encoding as 32-bit, so the output width is halved and we pack two 16-bit pixels in one 32-bit pixel.
   1369   uint left = SampleVRAM(sample_coords);
   1370   uint right = SampleVRAM(uint2(sample_coords.x + 1u, sample_coords.y));
   1371 
   1372   o_col0 = float4(float(left & 0xFFu), float((left >> 8) & 0xFFu),
   1373                   float(right & 0xFFu), float((right >> 8) & 0xFFu))
   1374             / float4(255.0, 255.0, 255.0, 255.0);
   1375 })";
   1376 
   1377   return ss.str();
   1378 }
   1379 
   1380 std::string GPU_HW_ShaderGen::GenerateVRAMWriteFragmentShader(bool use_buffer, bool use_ssbo)
   1381 {
          // Builds and returns the source text of a fragment shader that fetches a 16-bit value for each
          // fragment, converts it with RGBA5551ToRGBA8 and writes it to o_col0 (and to o_depth when
          // m_write_mask_as_depth is set). The value comes from one of three places:
          //   - a texture, when use_buffer is false;
          //   - a shader storage buffer, when use_buffer and use_ssbo are set and m_glsl is true;
          //   - a texture buffer otherwise.
   1382   std::stringstream ss;
   1383   WriteHeader(ss);
   1384   WriteCommonFunctions(ss);
   1385   DefineMacro(ss, "WRITE_MASK_AS_DEPTH", m_write_mask_as_depth);
   1386   DefineMacro(ss, "USE_BUFFER", use_buffer);
   1387   DeclareUniformBuffer(ss,
   1388                        {"uint2 u_base_coords", "uint2 u_end_coords", "uint2 u_size", "uint u_buffer_base_offset",
   1389                         "uint u_mask_or_bits", "float u_depth_value"},
   1390                        true);
   1391 
   1392   if (!use_buffer)
   1393   {
   1394     DeclareTexture(ss, "samp0", 0, false, true, true);
   1395   }
   1396   else if (use_ssbo && m_glsl)
   1397   {
            // The std430 block's layout qualifier depends on the API: explicit set/binding for Vulkan and Metal,
            // a binding only when m_use_glsl_binding_layout is set, and no binding qualifier otherwise.
   1398     ss << "layout(std430";
   1399     if (IsVulkan())
   1400       ss << ", set = 0, binding = 0";
   1401     else if (IsMetal())
   1402       ss << ", set = 0, binding = 1";
   1403     else if (m_use_glsl_binding_layout)
   1404       ss << ", binding = 0";
   1405 
   1406     ss << ") readonly restrict buffer SSBO {\n";
   1407     ss << "  uint ssbo_data[];\n";
   1408     ss << "};\n\n";
   1409 
            // Two 16-bit values share each uint: even offsets read the low half, odd offsets shift the high half
            // down. The macro itself does not mask off the upper bits.
   1410     ss << "#define GET_VALUE(buffer_offset) (ssbo_data[(buffer_offset) / 2u] >> (((buffer_offset) % 2u) * 16u))\n\n";
   1411   }
   1412   else
   1413   {
   1414     DeclareTextureBuffer(ss, "samp0", 0, true, true);
   1415     ss << "#define GET_VALUE(buffer_offset) (LOAD_TEXTURE_BUFFER(samp0, int(buffer_offset)).r)\n\n";
   1416   }
   1417 
          // Entry point body. A fragment is discarded only when its coordinate is both below u_base_coords and
          // at or beyond u_end_coords on the same axis; coordinates below the base are otherwise treated as
          // wrapped and offset by the unscaled VRAM size.
          // NOTE(review): only the USE_BUFFER path ORs in u_mask_or_bits; the texture path uses the loaded value
          // as-is. Presumably the mask is already applied to the texture contents; confirm against the caller.
   1418   DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1, false, m_write_mask_as_depth);
   1419   ss << R"(
   1420 {
   1421   uint2 coords = uint2(v_pos.xy) / uint2(RESOLUTION_SCALE, RESOLUTION_SCALE);
   1422 
   1423   // make sure it's not oversized and out of range
   1424   if ((coords.x < u_base_coords.x && coords.x >= u_end_coords.x) ||
   1425       (coords.y < u_base_coords.y && coords.y >= u_end_coords.y))
   1426   {
   1427     discard;
   1428   }
   1429 
   1430   // find offset from the start of the row/column
   1431   uint2 offset;
   1432   offset.x = (coords.x < u_base_coords.x) ? ((VRAM_SIZE.x / RESOLUTION_SCALE) - u_base_coords.x + coords.x) : (coords.x - u_base_coords.x);
   1433   offset.y = (coords.y < u_base_coords.y) ? ((VRAM_SIZE.y / RESOLUTION_SCALE) - u_base_coords.y + coords.y) : (coords.y - u_base_coords.y);
   1434 
   1435 #if !USE_BUFFER
   1436   uint value = LOAD_TEXTURE(samp0, int2(offset), 0).x;
   1437 #else
   1438   uint buffer_offset = u_buffer_base_offset + (offset.y * u_size.x) + offset.x;
   1439   uint value = GET_VALUE(buffer_offset) | u_mask_or_bits;
   1440 #endif
   1441 
   1442   o_col0 = RGBA5551ToRGBA8(value);
   1443 #if WRITE_MASK_AS_DEPTH
   1444   o_depth = (o_col0.a == 1.0) ? u_depth_value : 0.0;
   1445 #endif
   1446 })";
   1447 
   1448   return ss.str();
   1449 }
   1450 
   1451 std::string GPU_HW_ShaderGen::GenerateVRAMCopyFragmentShader()
   1452 {
   1453   // TODO: This won't currently work because we can't bind the texture to both the shader and framebuffer.
   1454   const bool msaa = false;
   1455 
   1456   std::stringstream ss;
   1457   WriteHeader(ss);
   1458   WriteCommonFunctions(ss);
   1459   DefineMacro(ss, "WRITE_MASK_AS_DEPTH", m_write_mask_as_depth);
   1460   DeclareUniformBuffer(ss,
   1461                        {"uint2 u_src_coords", "uint2 u_dst_coords", "uint2 u_end_coords", "uint2 u_size",
   1462                         "bool u_set_mask_bit", "float u_depth_value"},
   1463                        true);
   1464 
   1465   DeclareTexture(ss, "samp0", 0, msaa);
   1466   DefineMacro(ss, "MSAA_COPY", msaa);
   1467   DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1, false, m_write_mask_as_depth, false, false, msaa);
   1468   ss << R"(
   1469 {
   1470   uint2 dst_coords = uint2(v_pos.xy);
   1471 
   1472   // make sure it's not oversized and out of range
   1473   if ((dst_coords.x < u_dst_coords.x && dst_coords.x >= u_end_coords.x) ||
   1474       (dst_coords.y < u_dst_coords.y && dst_coords.y >= u_end_coords.y))
   1475   {
   1476     discard;
   1477   }
   1478 
   1479   // find offset from the start of the row/column
   1480   uint2 offset;
   1481   offset.x = (dst_coords.x < u_dst_coords.x) ? (VRAM_SIZE.x - u_dst_coords.x + dst_coords.x) : (dst_coords.x - u_dst_coords.x);
   1482   offset.y = (dst_coords.y < u_dst_coords.y) ? (VRAM_SIZE.y - u_dst_coords.y + dst_coords.y) : (dst_coords.y - u_dst_coords.y);
   1483 
   1484   // find the source coordinates to copy from
   1485   uint2 src_coords = (u_src_coords + offset) % VRAM_SIZE;
   1486 
   1487   // sample and apply mask bit
   1488 #if MSAA_COPY
   1489   float4 color = LOAD_TEXTURE_MS(samp0, int2(src_coords), f_sample_index);
   1490 #else
   1491   float4 color = LOAD_TEXTURE(samp0, int2(src_coords), 0);
   1492 #endif
   1493   o_col0 = float4(color.xyz, u_set_mask_bit ? 1.0 : color.a);
   1494 #if WRITE_MASK_AS_DEPTH
   1495   o_depth = (u_set_mask_bit ? 1.0f : ((o_col0.a == 1.0) ? u_depth_value : 0.0));
   1496 #endif
   1497 })";
   1498 
   1499   return ss.str();
   1500 }
   1501 
   1502 std::string GPU_HW_ShaderGen::GenerateVRAMFillFragmentShader(bool wrapped, bool interlaced)
   1503 {
   1504   std::stringstream ss;
   1505   WriteHeader(ss);
   1506   WriteCommonFunctions(ss);
   1507   DefineMacro(ss, "WRITE_MASK_AS_DEPTH", m_write_mask_as_depth);
   1508   DefineMacro(ss, "WRAPPED", wrapped);
   1509   DefineMacro(ss, "INTERLACED", interlaced);
   1510 
   1511   DeclareUniformBuffer(
   1512     ss, {"uint2 u_dst_coords", "uint2 u_end_coords", "float4 u_fill_color", "uint u_interlaced_displayed_field"}, true);
   1513 
   1514   DeclareFragmentEntryPoint(ss, 0, 1, {}, interlaced || wrapped, 1, false, m_write_mask_as_depth, false, false, false);
   1515   ss << R"(
   1516 {
   1517 #if INTERLACED || WRAPPED
   1518   uint2 dst_coords = uint2(v_pos.xy);
   1519 #endif
   1520 
   1521 #if INTERLACED
   1522   if ((dst_coords.y & 1u) == u_interlaced_displayed_field)
   1523     discard;
   1524 #endif
   1525 
   1526 #if WRAPPED
   1527   // make sure it's not oversized and out of range
   1528   if ((dst_coords.x < u_dst_coords.x && dst_coords.x >= u_end_coords.x) ||
   1529       (dst_coords.y < u_dst_coords.y && dst_coords.y >= u_end_coords.y))
   1530   {
   1531     discard;
   1532   }
   1533 #endif
   1534 
   1535   o_col0 = u_fill_color;
   1536 #if WRITE_MASK_AS_DEPTH
   1537   o_depth = u_fill_color.a;
   1538 #endif
   1539 })";
   1540 
   1541   return ss.str();
   1542 }
   1543 
   1544 std::string GPU_HW_ShaderGen::GenerateVRAMUpdateDepthFragmentShader()
   1545 {
   1546   std::stringstream ss;
   1547   WriteHeader(ss);
   1548   WriteCommonFunctions(ss);
   1549   DeclareTexture(ss, "samp0", 0, UsingMSAA());
   1550   DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 0, false, true, false, false, UsingMSAA());
   1551 
   1552   ss << R"(
   1553 {
   1554 #if MULTISAMPLING
   1555   o_depth = LOAD_TEXTURE_MS(samp0, int2(v_pos.xy), f_sample_index).a;
   1556 #else
   1557   o_depth = LOAD_TEXTURE(samp0, int2(v_pos.xy), 0).a;
   1558 #endif
   1559 }
   1560 )";
   1561 
   1562   return ss.str();
   1563 }
   1564 
   1565 void GPU_HW_ShaderGen::WriteAdaptiveDownsampleUniformBuffer(std::stringstream& ss)
   1566 {
   1567   DeclareUniformBuffer(ss, {"float2 u_uv_min", "float2 u_uv_max", "float2 u_rcp_resolution", "float u_lod"}, true);
   1568 }
   1569 
   1570 std::string GPU_HW_ShaderGen::GenerateAdaptiveDownsampleVertexShader()
   1571 {
   1572   std::stringstream ss;
   1573   WriteHeader(ss);
   1574   WriteAdaptiveDownsampleUniformBuffer(ss);
   1575   DeclareVertexEntryPoint(ss, {}, 0, 1, {}, true);
   1576   ss << R"(
   1577 {
   1578   v_tex0 = float2(float((v_id << 1) & 2u), float(v_id & 2u));
   1579   v_pos = float4(v_tex0 * float2(2.0f, -2.0f) + float2(-1.0f, 1.0f), 0.0f, 1.0f);
   1580   v_tex0 = u_uv_min + (u_uv_max - u_uv_min) * v_tex0;
   1581   #if API_OPENGL || API_OPENGL_ES || API_VULKAN
   1582     v_pos.y = -v_pos.y;
   1583   #endif
   1584 }
   1585 )";
   1586   return ss.str();
   1587 }
   1588 
   1589 std::string GPU_HW_ShaderGen::GenerateAdaptiveDownsampleMipFragmentShader(bool first_pass)
   1590 {
   1591   std::stringstream ss;
   1592   WriteHeader(ss);
   1593   WriteCommonFunctions(ss);
   1594   WriteAdaptiveDownsampleUniformBuffer(ss);
   1595   DeclareTexture(ss, "samp0", 0, false);
   1596   DefineMacro(ss, "FIRST_PASS", first_pass);
   1597 
   1598   // mipmap_energy.glsl ported from parallel-rsx.
   1599   ss << R"(
   1600 
   1601 float4 get_bias(float3 c00, float3 c01, float3 c10, float3 c11)
   1602 {
   1603    // Measure the "energy" (variance) in the pixels.
   1604    // If the pixels are all the same (2D content), use maximum bias, otherwise, taper off quickly back to 0 (edges)
   1605    float3 avg = 0.25 * (c00 + c01 + c10 + c11);
   1606    float s00 = dot(c00 - avg, c00 - avg);
   1607    float s01 = dot(c01 - avg, c01 - avg);
   1608    float s10 = dot(c10 - avg, c10 - avg);
   1609    float s11 = dot(c11 - avg, c11 - avg);
   1610    return float4(avg, 1.0 - log2(1000.0 * (s00 + s01 + s10 + s11) + 1.0));
   1611 }
   1612 
   1613 float4 get_bias(float4 c00, float4 c01, float4 c10, float4 c11)
   1614 {
   1615    // Measure the "energy" (variance) in the pixels.
   1616    // If the pixels are all the same (2D content), use maximum bias, otherwise, taper off quickly back to 0 (edges)
   1617    float avg = 0.25 * (c00.a + c01.a + c10.a + c11.a);
   1618    float4 bias = get_bias(c00.rgb, c01.rgb, c10.rgb, c11.rgb);
   1619    bias.a *= avg;
   1620    return bias;
   1621 }
   1622 
   1623 )";
   1624 
   1625   DeclareFragmentEntryPoint(ss, 0, 1);
   1626   ss << R"(
   1627 {
   1628   float2 uv = v_tex0 - (u_rcp_resolution * 0.25);
   1629 #ifdef FIRST_PASS
   1630    vec3 c00 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(0, 0)).rgb;
   1631    vec3 c01 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(0, 1)).rgb;
   1632    vec3 c10 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(1, 0)).rgb;
   1633    vec3 c11 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(1, 1)).rgb;
   1634    o_col0 = get_bias(c00, c01, c10, c11);
   1635 #else
   1636    vec4 c00 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(0, 0));
   1637    vec4 c01 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(0, 1));
   1638    vec4 c10 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(1, 0));
   1639    vec4 c11 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(1, 1));
   1640    o_col0 = get_bias(c00, c01, c10, c11);
   1641 #endif
   1642 }
   1643 )";
   1644 
   1645   return ss.str();
   1646 }
   1647 
   1648 std::string GPU_HW_ShaderGen::GenerateAdaptiveDownsampleBlurFragmentShader()
   1649 {
   1650   std::stringstream ss;
   1651   WriteHeader(ss);
   1652   WriteCommonFunctions(ss);
   1653   WriteAdaptiveDownsampleUniformBuffer(ss);
   1654   DeclareTexture(ss, "samp0", 0, false);
   1655 
   1656   // mipmap_blur.glsl ported from parallel-rsx.
   1657   DeclareFragmentEntryPoint(ss, 0, 1);
   1658   ss << R"(
   1659 {
   1660   float bias = 0.0;
   1661   const float w0 = 0.25;
   1662   const float w1 = 0.125;
   1663   const float w2 = 0.0625;
   1664 #define UV(x, y) clamp((v_tex0 + float2(x, y) * u_rcp_resolution), u_uv_min, u_uv_max)
   1665   bias += w2 * SAMPLE_TEXTURE(samp0, UV(-1.0, -1.0)).a;
   1666   bias += w2 * SAMPLE_TEXTURE(samp0, UV(+1.0, -1.0)).a;
   1667   bias += w2 * SAMPLE_TEXTURE(samp0, UV(-1.0, +1.0)).a;
   1668   bias += w2 * SAMPLE_TEXTURE(samp0, UV(+1.0, +1.0)).a;
   1669   bias += w1 * SAMPLE_TEXTURE(samp0, UV( 0.0, -1.0)).a;
   1670   bias += w1 * SAMPLE_TEXTURE(samp0, UV(-1.0,  0.0)).a;
   1671   bias += w1 * SAMPLE_TEXTURE(samp0, UV(+1.0,  0.0)).a;
   1672   bias += w1 * SAMPLE_TEXTURE(samp0, UV( 0.0, +1.0)).a;
   1673   bias += w0 * SAMPLE_TEXTURE(samp0, UV( 0.0,  0.0)).a;
   1674   o_col0 = float4(bias, bias, bias, bias);
   1675 }
   1676 )";
   1677 
   1678   return ss.str();
   1679 }
   1680 
   1681 std::string GPU_HW_ShaderGen::GenerateAdaptiveDownsampleCompositeFragmentShader()
   1682 {
   1683   std::stringstream ss;
   1684   WriteHeader(ss);
   1685   WriteCommonFunctions(ss);
   1686   DeclareTexture(ss, "samp0", 0, false);
   1687   DeclareTexture(ss, "samp1", 1, false);
   1688 
   1689   // mipmap_resolve.glsl ported from parallel-rsx.
   1690   DeclareFragmentEntryPoint(ss, 0, 1, {}, true);
   1691   ss << R"(
   1692 {
   1693   float bias = SAMPLE_TEXTURE(samp1, v_tex0).r;
   1694   float mip = float(RESOLUTION_SCALE - 1u) * bias;
   1695   float3 color = SAMPLE_TEXTURE_LEVEL(samp0, v_tex0, mip).rgb;
   1696   o_col0 = float4(color, 1.0);
   1697 }
   1698 )";
   1699 
   1700   return ss.str();
   1701 }
   1702 
   1703 std::string GPU_HW_ShaderGen::GenerateBoxSampleDownsampleFragmentShader(u32 factor)
   1704 {
   1705   std::stringstream ss;
   1706   WriteHeader(ss);
   1707   WriteCommonFunctions(ss);
   1708   DeclareUniformBuffer(ss, {"uint2 u_base_coords"}, true);
   1709   DeclareTexture(ss, "samp0", 0, false);
   1710 
   1711   ss << "#define FACTOR " << factor << "\n";
   1712 
   1713   DeclareFragmentEntryPoint(ss, 0, 1, {}, true);
   1714   ss << R"(
   1715 {
   1716   float3 color = float3(0.0, 0.0, 0.0);
   1717   uint2 base_coords = u_base_coords + uint2(v_pos.xy) * uint2(FACTOR, FACTOR);
   1718   for (uint offset_x = 0u; offset_x < FACTOR; offset_x++)
   1719   {
   1720     for (uint offset_y = 0u; offset_y < FACTOR; offset_y++)
   1721       color += LOAD_TEXTURE(samp0, int2(base_coords + uint2(offset_x, offset_y)), 0).rgb;
   1722   }
   1723   color /= float(FACTOR * FACTOR);
   1724   o_col0 = float4(color, 1.0);
   1725 }
   1726 )";
   1727 
   1728   return ss.str();
   1729 }