// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)

#include "gpu_hw_shadergen.h"

#include "common/assert.h"

GPU_HW_ShaderGen::GPU_HW_ShaderGen(RenderAPI render_api, u32 resolution_scale, u32 multisamples,
                                   bool per_sample_shading, bool true_color, bool scaled_dithering,
                                   bool write_mask_as_depth, bool disable_color_perspective,
                                   bool supports_dual_source_blend, bool supports_framebuffer_fetch, bool debanding)
  : ShaderGen(render_api, GetShaderLanguageForAPI(render_api), supports_dual_source_blend, supports_framebuffer_fetch),
    m_resolution_scale(resolution_scale), m_multisamples(multisamples), m_per_sample_shading(per_sample_shading),
    m_true_color(true_color), m_scaled_dithering(scaled_dithering), m_write_mask_as_depth(write_mask_as_depth),
    m_disable_color_perspective(disable_color_perspective), m_debanding(debanding)
{
}

GPU_HW_ShaderGen::~GPU_HW_ShaderGen() = default;

void GPU_HW_ShaderGen::WriteCommonFunctions(std::stringstream& ss)
{
  DefineMacro(ss, "MULTISAMPLING", UsingMSAA());

  ss << "CONSTANT uint RESOLUTION_SCALE = " << m_resolution_scale << "u;\n";
  ss << "CONSTANT uint2 VRAM_SIZE = uint2(" << VRAM_WIDTH << ", " << VRAM_HEIGHT << ") * RESOLUTION_SCALE;\n";
  ss << "CONSTANT uint MULTISAMPLES = " << m_multisamples << "u;\n";
  ss << "CONSTANT bool PER_SAMPLE_SHADING = " << (m_per_sample_shading ? "true" : "false") << ";\n";
  ss << R"(
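// VRAM texels are 16-bit 1555 values: 5 bits per colour channel, with the mask bit in bit 15.
// e.g. RGBA8ToRGBA5551(float4(1.0, 0.0, 0.0, 1.0)) == 0x801Fu (r = 31, a = 1).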
uint RGBA8ToRGBA5551(float4 v)
{
  uint r = uint(roundEven(v.r * 31.0));
  uint g = uint(roundEven(v.g * 31.0));
  uint b = uint(roundEven(v.b * 31.0));
  uint a = (v.a != 0.0) ? 1u : 0u;
  return (r) | (g << 5) | (b << 10) | (a << 15);
}

float4 RGBA5551ToRGBA8(uint v)
{
  uint r = (v & 31u);
  uint g = ((v >> 5) & 31u);
  uint b = ((v >> 10) & 31u);
  uint a = ((v >> 15) & 1u);

  return float4(float(r) / 31.0, float(g) / 31.0, float(b) / 31.0, float(a));
}
)";
}

void GPU_HW_ShaderGen::WriteBatchUniformBuffer(std::stringstream& ss)
{
  DeclareUniformBuffer(ss,
                       {"uint2 u_texture_window_and", "uint2 u_texture_window_or", "float u_src_alpha_factor",
                        "float u_dst_alpha_factor", "uint u_interlaced_displayed_field",
                        "bool u_set_mask_while_drawing"},
                       false);
}

std::string GPU_HW_ShaderGen::GenerateBatchVertexShader(bool textured, bool palette, bool uv_limits,
                                                        bool force_round_texcoords, bool pgxp_depth)
{
  std::stringstream ss;
  WriteHeader(ss);
  DefineMacro(ss, "TEXTURED", textured);
  DefineMacro(ss, "PALETTE", palette);
  DefineMacro(ss, "UV_LIMITS", uv_limits);
  DefineMacro(ss, "FORCE_ROUND_TEXCOORDS", force_round_texcoords);
  DefineMacro(ss, "PGXP_DEPTH", pgxp_depth);

  WriteCommonFunctions(ss);
  WriteBatchUniformBuffer(ss);

  if (textured)
  {
    if (uv_limits)
    {
      DeclareVertexEntryPoint(
        ss, {"float4 a_pos", "float4 a_col0", "uint a_texcoord", "uint a_texpage", "float4 a_uv_limits"}, 1, 1,
        {{"nointerpolation", palette ? "uint4 v_texpage" : "uint2 v_texpage"},
         {"nointerpolation", "float4 v_uv_limits"}},
        false, "", UsingMSAA(), UsingPerSampleShading(), m_disable_color_perspective);
    }
    else
    {
      DeclareVertexEntryPoint(ss, {"float4 a_pos", "float4 a_col0", "uint a_texcoord", "uint a_texpage"}, 1, 1,
                              {{"nointerpolation", palette ? "uint4 v_texpage" : "uint2 v_texpage"}}, false, "",
                              UsingMSAA(), UsingPerSampleShading(), m_disable_color_perspective);
    }
  }
  else
  {
    DeclareVertexEntryPoint(ss, {"float4 a_pos", "float4 a_col0"}, 1, 0, {}, false, "", UsingMSAA(),
                            UsingPerSampleShading(), m_disable_color_perspective);
  }

  ss << R"(
{
  // Offset the vertex position by 0.5 to ensure correct interpolation of texture coordinates
  // at 1x resolution scale. This doesn't work at >1x, we adjust the texture coordinates before
  // uploading there instead.
  float vertex_offset = (RESOLUTION_SCALE == 1u) ? 0.5 : 0.0;

  // 0..+1023 -> -1..1
  float pos_x = ((a_pos.x + vertex_offset) / 512.0) - 1.0;
  float pos_y = ((a_pos.y + vertex_offset) / -256.0) + 1.0;

#if PGXP_DEPTH
  // Ignore mask Z when using PGXP depth.
  float pos_z = a_pos.w;
  float pos_w = a_pos.w;
#else
  float pos_z = a_pos.z;
  float pos_w = a_pos.w;
#endif

#if API_OPENGL || API_OPENGL_ES
  // 0..1 to -1..1 depth range.
  pos_z = (pos_z * 2.0) - 1.0;
#endif

  // NDC space Y flip in Vulkan.
#if API_OPENGL || API_OPENGL_ES || API_VULKAN
  pos_y = -pos_y;
#endif

  v_pos = float4(pos_x * pos_w, pos_y * pos_w, pos_z * pos_w, pos_w);

  v_col0 = a_col0;
#if TEXTURED
  v_tex0 = float2(uint2(a_texcoord & 0xFFFFu, a_texcoord >> 16));
#if !PALETTE
  v_tex0 *= float(RESOLUTION_SCALE);
#endif

  // base_x,base_y,palette_x,palette_y
  v_texpage.x = (a_texpage & 15u) * 64u;
  v_texpage.y = ((a_texpage >> 4) & 1u) * 256u;
#if PALETTE
  v_texpage.z = ((a_texpage >> 16) & 63u) * 16u;
  v_texpage.w = ((a_texpage >> 22) & 511u);
#endif

#if UV_LIMITS
  v_uv_limits = a_uv_limits * 255.0;

#if FORCE_ROUND_TEXCOORDS && PALETTE
  // Add 0.5 to the upper bounds when upscaling, to work around interpolation differences.
  // Limited to force-round-texcoord hack, to avoid breaking other games.
  v_uv_limits.zw += 0.5;
#elif !PALETTE
  // Treat coordinates as being in upscaled space, and extend the UV range to all "upscaled"
  // pixels. This means 1-pixel-high polygon-based framebuffer effects won't be downsampled.
  // (e.g. Mega Man Legends 2 haze effect)
  v_uv_limits *= float(RESOLUTION_SCALE);
  v_uv_limits.zw += float(RESOLUTION_SCALE - 1u);
#endif
#endif
#endif
}
)";

  return ss.str();
}

void GPU_HW_ShaderGen::WriteBatchTextureFilter(std::stringstream& ss, GPUTextureFilter texture_filter)
{
  // JINC2 and xBRZ shaders originally from beetle-psx, modified to support filtering mask channel.
  if (texture_filter == GPUTextureFilter::Bilinear || texture_filter == GPUTextureFilter::BilinearBinAlpha)
  {
    DefineMacro(ss, "BINALPHA", texture_filter == GPUTextureFilter::BilinearBinAlpha);
    ss << R"(
void FilteredSampleFromVRAM(TEXPAGE_VALUE texpage, float2 coords, float4 uv_limits,
                            out float4 texcol, out float ialpha)
{
  // Compute the coordinates of the four texels we will be interpolating between.
  // Clamp this to the triangle texture coordinates.
  float2 texel_top_left = frac(coords) - float2(0.5, 0.5);
  float2 texel_offset = sign(texel_top_left);
  float4 fcoords = max(coords.xyxy + float4(0.0, 0.0, texel_offset.x, texel_offset.y),
                       float4(0.0, 0.0, 0.0, 0.0));

  // Load four texels.
  float4 s00 = SampleFromVRAM(texpage, clamp(fcoords.xy, uv_limits.xy, uv_limits.zw));
  float4 s10 = SampleFromVRAM(texpage, clamp(fcoords.zy, uv_limits.xy, uv_limits.zw));
  float4 s01 = SampleFromVRAM(texpage, clamp(fcoords.xw, uv_limits.xy, uv_limits.zw));
  float4 s11 = SampleFromVRAM(texpage, clamp(fcoords.zw, uv_limits.xy, uv_limits.zw));

  // Compute alpha from how many texels aren't pixel color 0000h.
  float a00 = float(VECTOR_NEQ(s00, TRANSPARENT_PIXEL_COLOR));
  float a10 = float(VECTOR_NEQ(s10, TRANSPARENT_PIXEL_COLOR));
  float a01 = float(VECTOR_NEQ(s01, TRANSPARENT_PIXEL_COLOR));
  float a11 = float(VECTOR_NEQ(s11, TRANSPARENT_PIXEL_COLOR));

  // Bilinearly interpolate.
  float2 weights = abs(texel_top_left);
  texcol = lerp(lerp(s00, s10, weights.x), lerp(s01, s11, weights.x), weights.y);
  ialpha = lerp(lerp(a00, a10, weights.x), lerp(a01, a11, weights.x), weights.y);

  // Compensate for partially transparent sampling.
  if (ialpha > 0.0)
    texcol.rgb /= float3(ialpha, ialpha, ialpha);

#if BINALPHA
  ialpha = (ialpha >= 0.5) ? 1.0 : 0.0;
#endif
}
)";
  }
  else if (texture_filter == GPUTextureFilter::JINC2 || texture_filter == GPUTextureFilter::JINC2BinAlpha)
  {
    /*
      Hyllian's jinc windowed-jinc 2-lobe sharper with anti-ringing Shader

      Copyright (C) 2011-2016 Hyllian/Jararaca - sergiogdb@gmail.com

      Permission is hereby granted, free of charge, to any person obtaining a copy
      of this software and associated documentation files (the "Software"), to deal
      in the Software without restriction, including without limitation the rights
      to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
      copies of the Software, and to permit persons to whom the Software is
      furnished to do so, subject to the following conditions:

      The above copyright notice and this permission notice shall be included in
      all copies or substantial portions of the Software.

      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
      IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
      FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
      AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
      LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
      OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
      THE SOFTWARE.
    */
    DefineMacro(ss, "BINALPHA", texture_filter == GPUTextureFilter::JINC2BinAlpha);
    ss << R"(
CONSTANT float JINC2_WINDOW_SINC = 0.44;
CONSTANT float JINC2_SINC = 0.82;
CONSTANT float JINC2_AR_STRENGTH = 0.8;

CONSTANT float halfpi = 1.5707963267948966192313216916398;
CONSTANT float pi = 3.1415926535897932384626433832795;
CONSTANT float wa = 1.382300768;
CONSTANT float wb = 2.576105976;
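// wa/wb are the two sinc frequencies used by resampler(): JINC2_WINDOW_SINC * pi and JINC2_SINC * pi.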

// Calculates the distance between two points
float d(float2 pt1, float2 pt2)
{
  float2 v = pt2 - pt1;
  return sqrt(dot(v,v));
}

float min4(float a, float b, float c, float d)
{
  return min(a, min(b, min(c, d)));
}

float4 min4(float4 a, float4 b, float4 c, float4 d)
{
  return min(a, min(b, min(c, d)));
}

float max4(float a, float b, float c, float d)
{
  return max(a, max(b, max(c, d)));
}

float4 max4(float4 a, float4 b, float4 c, float4 d)
{
  return max(a, max(b, max(c, d)));
}

float4 resampler(float4 x)
{
  float4 res;

  // res = (x==float4(0.0, 0.0, 0.0, 0.0)) ? float4(wa*wb) : sin(x*wa)*sin(x*wb)/(x*x);
  // Need to use mix(.., equal(..)) since we want zero check to be component wise
  res = lerp(sin(x*wa)*sin(x*wb)/(x*x), float4(wa*wb, wa*wb, wa*wb, wa*wb), VECTOR_COMP_EQ(x,float4(0.0, 0.0, 0.0, 0.0)));

  return res;
}

void FilteredSampleFromVRAM(TEXPAGE_VALUE texpage, float2 coords, float4 uv_limits,
                            out float4 texcol, out float ialpha)
{
  float4 weights[4];

  float2 dx = float2(1.0, 0.0);
  float2 dy = float2(0.0, 1.0);

  float2 pc = coords.xy;

  float2 tc = (floor(pc-float2(0.5,0.5))+float2(0.5,0.5));

  weights[0] = resampler(float4(d(pc, tc -dx -dy), d(pc, tc -dy), d(pc, tc +dx -dy), d(pc, tc+2.0*dx -dy)));
  weights[1] = resampler(float4(d(pc, tc -dx), d(pc, tc), d(pc, tc +dx), d(pc, tc+2.0*dx)));
  weights[2] = resampler(float4(d(pc, tc -dx +dy), d(pc, tc +dy), d(pc, tc +dx +dy), d(pc, tc+2.0*dx +dy)));
  weights[3] = resampler(float4(d(pc, tc -dx+2.0*dy), d(pc, tc +2.0*dy), d(pc, tc +dx+2.0*dy), d(pc, tc+2.0*dx+2.0*dy)));

  dx = dx;
  dy = dy;
  tc = tc;

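  // Sample a 4x4 neighbourhood around the pixel; cXY holds the texel colour and aXY its opacity
  // (0.0 when the texel is colour 0000h, i.e. fully transparent).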
#define sample_texel(coords) SampleFromVRAM(texpage, clamp((coords), uv_limits.xy, uv_limits.zw))

  float4 c00 = sample_texel(tc -dx -dy);
  float a00 = float(VECTOR_NEQ(c00, TRANSPARENT_PIXEL_COLOR));
  float4 c10 = sample_texel(tc -dy);
  float a10 = float(VECTOR_NEQ(c10, TRANSPARENT_PIXEL_COLOR));
  float4 c20 = sample_texel(tc +dx -dy);
  float a20 = float(VECTOR_NEQ(c20, TRANSPARENT_PIXEL_COLOR));
  float4 c30 = sample_texel(tc+2.0*dx -dy);
  float a30 = float(VECTOR_NEQ(c30, TRANSPARENT_PIXEL_COLOR));
  float4 c01 = sample_texel(tc -dx);
  float a01 = float(VECTOR_NEQ(c01, TRANSPARENT_PIXEL_COLOR));
  float4 c11 = sample_texel(tc);
  float a11 = float(VECTOR_NEQ(c11, TRANSPARENT_PIXEL_COLOR));
  float4 c21 = sample_texel(tc +dx);
  float a21 = float(VECTOR_NEQ(c21, TRANSPARENT_PIXEL_COLOR));
  float4 c31 = sample_texel(tc+2.0*dx);
  float a31 = float(VECTOR_NEQ(c31, TRANSPARENT_PIXEL_COLOR));
  float4 c02 = sample_texel(tc -dx +dy);
  float a02 = float(VECTOR_NEQ(c02, TRANSPARENT_PIXEL_COLOR));
  float4 c12 = sample_texel(tc +dy);
  float a12 = float(VECTOR_NEQ(c12, TRANSPARENT_PIXEL_COLOR));
  float4 c22 = sample_texel(tc +dx +dy);
  float a22 = float(VECTOR_NEQ(c22, TRANSPARENT_PIXEL_COLOR));
  float4 c32 = sample_texel(tc+2.0*dx +dy);
  float a32 = float(VECTOR_NEQ(c32, TRANSPARENT_PIXEL_COLOR));
  float4 c03 = sample_texel(tc -dx+2.0*dy);
  float a03 = float(VECTOR_NEQ(c03, TRANSPARENT_PIXEL_COLOR));
  float4 c13 = sample_texel(tc +2.0*dy);
  float a13 = float(VECTOR_NEQ(c13, TRANSPARENT_PIXEL_COLOR));
  float4 c23 = sample_texel(tc +dx+2.0*dy);
  float a23 = float(VECTOR_NEQ(c23, TRANSPARENT_PIXEL_COLOR));
  float4 c33 = sample_texel(tc+2.0*dx+2.0*dy);
  float a33 = float(VECTOR_NEQ(c33, TRANSPARENT_PIXEL_COLOR));

#undef sample_texel

  // Get min/max samples
  float4 min_sample = min4(c11, c21, c12, c22);
  float min_sample_alpha = min4(a11, a21, a12, a22);
  float4 max_sample = max4(c11, c21, c12, c22);
  float max_sample_alpha = max4(a11, a21, a12, a22);

  float4 color;
  color = float4(dot(weights[0], float4(c00.x, c10.x, c20.x, c30.x)), dot(weights[0], float4(c00.y, c10.y, c20.y, c30.y)), dot(weights[0], float4(c00.z, c10.z, c20.z, c30.z)), dot(weights[0], float4(c00.w, c10.w, c20.w, c30.w)));
  color+= float4(dot(weights[1], float4(c01.x, c11.x, c21.x, c31.x)), dot(weights[1], float4(c01.y, c11.y, c21.y, c31.y)), dot(weights[1], float4(c01.z, c11.z, c21.z, c31.z)), dot(weights[1], float4(c01.w, c11.w, c21.w, c31.w)));
  color+= float4(dot(weights[2], float4(c02.x, c12.x, c22.x, c32.x)), dot(weights[2], float4(c02.y, c12.y, c22.y, c32.y)), dot(weights[2], float4(c02.z, c12.z, c22.z, c32.z)), dot(weights[2], float4(c02.w, c12.w, c22.w, c32.w)));
  color+= float4(dot(weights[3], float4(c03.x, c13.x, c23.x, c33.x)), dot(weights[3], float4(c03.y, c13.y, c23.y, c33.y)), dot(weights[3], float4(c03.z, c13.z, c23.z, c33.z)), dot(weights[3], float4(c03.w, c13.w, c23.w, c33.w)));
  color = color/(dot(weights[0], float4(1,1,1,1)) + dot(weights[1], float4(1,1,1,1)) + dot(weights[2], float4(1,1,1,1)) + dot(weights[3], float4(1,1,1,1)));

  float alpha;
  alpha = dot(weights[0], float4(a00, a10, a20, a30));
  alpha+= dot(weights[1], float4(a01, a11, a21, a31));
  alpha+= dot(weights[2], float4(a02, a12, a22, a32));
  alpha+= dot(weights[3], float4(a03, a13, a23, a33));
  //alpha = alpha/(weights[0].w + weights[1].w + weights[2].w + weights[3].w);
  alpha = alpha/(dot(weights[0], float4(1,1,1,1)) + dot(weights[1], float4(1,1,1,1)) + dot(weights[2], float4(1,1,1,1)) + dot(weights[3], float4(1,1,1,1)));

  // Anti-ringing
  float4 aux = color;
  float aux_alpha = alpha;
  color = clamp(color, min_sample, max_sample);
  alpha = clamp(alpha, min_sample_alpha, max_sample_alpha);
  color = lerp(aux, color, JINC2_AR_STRENGTH);
  alpha = lerp(aux_alpha, alpha, JINC2_AR_STRENGTH);

  // final sum and weight normalization
  ialpha = alpha;
  texcol = color;

  // Compensate for partially transparent sampling.
  if (ialpha > 0.0)
    texcol.rgb /= float3(ialpha, ialpha, ialpha);

#if BINALPHA
  ialpha = (ialpha >= 0.5) ? 1.0 : 0.0;
#endif
}
)";
  }
  else if (texture_filter == GPUTextureFilter::xBR || texture_filter == GPUTextureFilter::xBRBinAlpha)
  {
    /*
      Hyllian's xBR-vertex code and texel mapping

      Copyright (C) 2011/2016 Hyllian - sergiogdb@gmail.com

      Permission is hereby granted, free of charge, to any person obtaining a copy
      of this software and associated documentation files (the "Software"), to deal
      in the Software without restriction, including without limitation the rights
      to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
      copies of the Software, and to permit persons to whom the Software is
      furnished to do so, subject to the following conditions:

      The above copyright notice and this permission notice shall be included in
      all copies or substantial portions of the Software.

      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
      IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
      FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
      AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
      LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
      OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
      THE SOFTWARE.
    */

    DefineMacro(ss, "BINALPHA", texture_filter == GPUTextureFilter::xBRBinAlpha);
    ss << R"(
CONSTANT int BLEND_NONE = 0;
CONSTANT int BLEND_NORMAL = 1;
CONSTANT int BLEND_DOMINANT = 2;
CONSTANT float LUMINANCE_WEIGHT = 1.0;
CONSTANT float EQUAL_COLOR_TOLERANCE = 0.1176470588235294;
CONSTANT float STEEP_DIRECTION_THRESHOLD = 2.2;
CONSTANT float DOMINANT_DIRECTION_THRESHOLD = 3.6;
CONSTANT float4 w = float4(0.2627, 0.6780, 0.0593, 0.5);

float DistYCbCr(float4 pixA, float4 pixB)
{
  const float scaleB = 0.5 / (1.0 - w.b);
  const float scaleR = 0.5 / (1.0 - w.r);
  float4 diff = pixA - pixB;
  float Y = dot(diff, w);
  float Cb = scaleB * (diff.b - Y);
  float Cr = scaleR * (diff.r - Y);

  return sqrt(((LUMINANCE_WEIGHT * Y) * (LUMINANCE_WEIGHT * Y)) + (Cb * Cb) + (Cr * Cr));
}

bool IsPixEqual(const float4 pixA, const float4 pixB)
{
  return (DistYCbCr(pixA, pixB) < EQUAL_COLOR_TOLERANCE);
}

float get_left_ratio(float2 center, float2 origin, float2 direction, float2 scale)
{
  float2 P0 = center - origin;
  float2 proj = direction * (dot(P0, direction) / dot(direction, direction));
  float2 distv = P0 - proj;
  float2 orth = float2(-direction.y, direction.x);
  float side = sign(dot(P0, orth));
  float v = side * length(distv * scale);

//  return step(0, v);
  return smoothstep(-sqrt(2.0)/2.0, sqrt(2.0)/2.0, v);
}

#define P(coord, xoffs, yoffs) SampleFromVRAM(texpage, clamp(coords + float2((xoffs), (yoffs)), uv_limits.xy, uv_limits.zw))

void FilteredSampleFromVRAM(TEXPAGE_VALUE texpage, float2 coords, float4 uv_limits,
                            out float4 texcol, out float ialpha)
{
  //---------------------------------------
  // Input Pixel Mapping:  -|x|x|x|-
  //                       x|A|B|C|x
  //                       x|D|E|F|x
  //                       x|G|H|I|x
  //                       -|x|x|x|-

  float2 scale = float2(8.0, 8.0);
  float2 pos = frac(coords.xy) - float2(0.5, 0.5);
  float2 coord = coords.xy - pos;

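  // The original alpha of each tap is kept in Aw..Iw; the .w component is reused as a 0/1
  // opacity flag (colour 0000h is transparent) for the edge-detection logic below.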
  float4 A = P(coord, -1,-1);
  float Aw = A.w;
  A.w = float(VECTOR_NEQ(A, TRANSPARENT_PIXEL_COLOR));
  float4 B = P(coord, 0,-1);
  float Bw = B.w;
  B.w = float(VECTOR_NEQ(B, TRANSPARENT_PIXEL_COLOR));
  float4 C = P(coord, 1,-1);
  float Cw = C.w;
  C.w = float(VECTOR_NEQ(C, TRANSPARENT_PIXEL_COLOR));
  float4 D = P(coord, -1, 0);
  float Dw = D.w;
  D.w = float(VECTOR_NEQ(D, TRANSPARENT_PIXEL_COLOR));
  float4 E = P(coord, 0, 0);
  float Ew = E.w;
  E.w = float(VECTOR_NEQ(E, TRANSPARENT_PIXEL_COLOR));
  float4 F = P(coord, 1, 0);
  float Fw = F.w;
  F.w = float(VECTOR_NEQ(F, TRANSPARENT_PIXEL_COLOR));
  float4 G = P(coord, -1, 1);
  float Gw = G.w;
  G.w = float(VECTOR_NEQ(G, TRANSPARENT_PIXEL_COLOR));
  float4 H = P(coord, 0, 1);
  float Hw = H.w;
  H.w = float(VECTOR_NEQ(H, TRANSPARENT_PIXEL_COLOR));
  float4 I = P(coord, 1, 1);
  float Iw = I.w;
  I.w = float(VECTOR_NEQ(I, TRANSPARENT_PIXEL_COLOR));

  // blendResult Mapping: x|y|
  //                      w|z|
  int4 blendResult = int4(BLEND_NONE,BLEND_NONE,BLEND_NONE,BLEND_NONE);

  // Preprocess corners
  // Pixel Tap Mapping: -|-|-|-|-
  //                    -|-|B|C|-
  //                    -|D|E|F|x
  //                    -|G|H|I|x
  //                    -|-|x|x|-
  if (!((VECTOR_EQ(E,F) && VECTOR_EQ(H,I)) || (VECTOR_EQ(E,H) && VECTOR_EQ(F,I))))
  {
    float dist_H_F = DistYCbCr(G, E) + DistYCbCr(E, C) + DistYCbCr(P(coord, 0,2), I) + DistYCbCr(I, P(coord, 2,0)) + (4.0 * DistYCbCr(H, F));
    float dist_E_I = DistYCbCr(D, H) + DistYCbCr(H, P(coord, 1,2)) + DistYCbCr(B, F) + DistYCbCr(F, P(coord, 2,1)) + (4.0 * DistYCbCr(E, I));
    bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_H_F) < dist_E_I;
    blendResult.z = ((dist_H_F < dist_E_I) && VECTOR_NEQ(E,F) && VECTOR_NEQ(E,H)) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
  }


  // Pixel Tap Mapping: -|-|-|-|-
  //                    -|A|B|-|-
  //                    x|D|E|F|-
  //                    x|G|H|I|-
  //                    -|x|x|-|-
  if (!((VECTOR_EQ(D,E) && VECTOR_EQ(G,H)) || (VECTOR_EQ(D,G) && VECTOR_EQ(E,H))))
  {
    float dist_G_E = DistYCbCr(P(coord, -2,1) , D) + DistYCbCr(D, B) + DistYCbCr(P(coord, -1,2), H) + DistYCbCr(H, F) + (4.0 * DistYCbCr(G, E));
    float dist_D_H = DistYCbCr(P(coord, -2,0) , G) + DistYCbCr(G, P(coord, 0,2)) + DistYCbCr(A, E) + DistYCbCr(E, I) + (4.0 * DistYCbCr(D, H));
    bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_D_H) < dist_G_E;
    blendResult.w = ((dist_G_E > dist_D_H) && VECTOR_NEQ(E,D) && VECTOR_NEQ(E,H)) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
  }

  // Pixel Tap Mapping: -|-|x|x|-
  //                    -|A|B|C|x
  //                    -|D|E|F|x
  //                    -|-|H|I|-
  //                    -|-|-|-|-
  if (!((VECTOR_EQ(B,C) && VECTOR_EQ(E,F)) || (VECTOR_EQ(B,E) && VECTOR_EQ(C,F))))
  {
    float dist_E_C = DistYCbCr(D, B) + DistYCbCr(B, P(coord, 1,-2)) + DistYCbCr(H, F) + DistYCbCr(F, P(coord, 2,-1)) + (4.0 * DistYCbCr(E, C));
    float dist_B_F = DistYCbCr(A, E) + DistYCbCr(E, I) + DistYCbCr(P(coord, 0,-2), C) + DistYCbCr(C, P(coord, 2,0)) + (4.0 * DistYCbCr(B, F));
    bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_B_F) < dist_E_C;
    blendResult.y = ((dist_E_C > dist_B_F) && VECTOR_NEQ(E,B) && VECTOR_NEQ(E,F)) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
  }

  // Pixel Tap Mapping: -|x|x|-|-
  //                    x|A|B|C|-
  //                    x|D|E|F|-
  //                    -|G|H|-|-
  //                    -|-|-|-|-
  if (!((VECTOR_EQ(A,B) && VECTOR_EQ(D,E)) || (VECTOR_EQ(A,D) && VECTOR_EQ(B,E))))
  {
    float dist_D_B = DistYCbCr(P(coord, -2,0), A) + DistYCbCr(A, P(coord, 0,-2)) + DistYCbCr(G, E) + DistYCbCr(E, C) + (4.0 * DistYCbCr(D, B));
    float dist_A_E = DistYCbCr(P(coord, -2,-1), D) + DistYCbCr(D, H) + DistYCbCr(P(coord, -1,-2), B) + DistYCbCr(B, F) + (4.0 * DistYCbCr(A, E));
    bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_D_B) < dist_A_E;
    blendResult.x = ((dist_D_B < dist_A_E) && VECTOR_NEQ(E,D) && VECTOR_NEQ(E,B)) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
  }

  float4 res = E;
  float resW = Ew;

  // Pixel Tap Mapping: -|-|-|-|-
  //                    -|-|B|C|-
  //                    -|D|E|F|x
  //                    -|G|H|I|x
  //                    -|-|x|x|-
  if(blendResult.z != BLEND_NONE)
  {
    float dist_F_G = DistYCbCr(F, G);
    float dist_H_C = DistYCbCr(H, C);
    bool doLineBlend = (blendResult.z == BLEND_DOMINANT ||
                !((blendResult.y != BLEND_NONE && !IsPixEqual(E, G)) || (blendResult.w != BLEND_NONE && !IsPixEqual(E, C)) ||
                  (IsPixEqual(G, H) && IsPixEqual(H, I) && IsPixEqual(I, F) && IsPixEqual(F, C) && !IsPixEqual(E, I))));

    float2 origin = float2(0.0, 1.0 / sqrt(2.0));
    float2 direction = float2(1.0, -1.0);
    if(doLineBlend)
    {
      bool haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_F_G <= dist_H_C) && VECTOR_NEQ(E,G) && VECTOR_NEQ(D,G);
      bool haveSteepLine = (STEEP_DIRECTION_THRESHOLD * dist_H_C <= dist_F_G) && VECTOR_NEQ(E,C) && VECTOR_NEQ(B,C);
      origin = haveShallowLine? float2(0.0, 0.25) : float2(0.0, 0.5);
      direction.x += haveShallowLine? 1.0: 0.0;
      direction.y -= haveSteepLine? 1.0: 0.0;
    }

    float4 blendPix = lerp(H,F, step(DistYCbCr(E, F), DistYCbCr(E, H)));
    float blendW = lerp(Hw,Fw, step(DistYCbCr(E, F), DistYCbCr(E, H)));
    res = lerp(res, blendPix, get_left_ratio(pos, origin, direction, scale));
    resW = lerp(resW, blendW, get_left_ratio(pos, origin, direction, scale));
  }

  // Pixel Tap Mapping: -|-|-|-|-
  //                    -|A|B|-|-
  //                    x|D|E|F|-
  //                    x|G|H|I|-
  //                    -|x|x|-|-
  if(blendResult.w != BLEND_NONE)
  {
    float dist_H_A = DistYCbCr(H, A);
    float dist_D_I = DistYCbCr(D, I);
    bool doLineBlend = (blendResult.w == BLEND_DOMINANT ||
                !((blendResult.z != BLEND_NONE && !IsPixEqual(E, A)) || (blendResult.x != BLEND_NONE && !IsPixEqual(E, I)) ||
                  (IsPixEqual(A, D) && IsPixEqual(D, G) && IsPixEqual(G, H) && IsPixEqual(H, I) && !IsPixEqual(E, G))));

    float2 origin = float2(-1.0 / sqrt(2.0), 0.0);
    float2 direction = float2(1.0, 1.0);
    if(doLineBlend)
    {
      bool haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_H_A <= dist_D_I) && VECTOR_NEQ(E,A) && VECTOR_NEQ(B,A);
      bool haveSteepLine = (STEEP_DIRECTION_THRESHOLD * dist_D_I <= dist_H_A) && VECTOR_NEQ(E,I) && VECTOR_NEQ(F,I);
      origin = haveShallowLine? float2(-0.25, 0.0) : float2(-0.5, 0.0);
      direction.y += haveShallowLine? 1.0: 0.0;
      direction.x += haveSteepLine? 1.0: 0.0;
    }
    origin = origin;
    direction = direction;

    float4 blendPix = lerp(H,D, step(DistYCbCr(E, D), DistYCbCr(E, H)));
    float blendW = lerp(Hw,Dw, step(DistYCbCr(E, D), DistYCbCr(E, H)));
    res = lerp(res, blendPix, get_left_ratio(pos, origin, direction, scale));
    resW = lerp(resW, blendW, get_left_ratio(pos, origin, direction, scale));
  }

  // Pixel Tap Mapping: -|-|x|x|-
  //                    -|A|B|C|x
  //                    -|D|E|F|x
  //                    -|-|H|I|-
  //                    -|-|-|-|-
  if(blendResult.y != BLEND_NONE)
  {
    float dist_B_I = DistYCbCr(B, I);
    float dist_F_A = DistYCbCr(F, A);
    bool doLineBlend = (blendResult.y == BLEND_DOMINANT ||
                !((blendResult.x != BLEND_NONE && !IsPixEqual(E, I)) || (blendResult.z != BLEND_NONE && !IsPixEqual(E, A)) ||
                  (IsPixEqual(I, F) && IsPixEqual(F, C) && IsPixEqual(C, B) && IsPixEqual(B, A) && !IsPixEqual(E, C))));

    float2 origin = float2(1.0 / sqrt(2.0), 0.0);
    float2 direction = float2(-1.0, -1.0);

    if(doLineBlend)
    {
      bool haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_B_I <= dist_F_A) && VECTOR_NEQ(E,I) && VECTOR_NEQ(H,I);
      bool haveSteepLine = (STEEP_DIRECTION_THRESHOLD * dist_F_A <= dist_B_I) && VECTOR_NEQ(E,A) && VECTOR_NEQ(D,A);
      origin = haveShallowLine? float2(0.25, 0.0) : float2(0.5, 0.0);
      direction.y -= haveShallowLine? 1.0: 0.0;
      direction.x -= haveSteepLine? 1.0: 0.0;
    }

    float4 blendPix = lerp(F,B, step(DistYCbCr(E, B), DistYCbCr(E, F)));
    float blendW = lerp(Fw,Bw, step(DistYCbCr(E, B), DistYCbCr(E, F)));
    res = lerp(res, blendPix, get_left_ratio(pos, origin, direction, scale));
    resW = lerp(resW, blendW, get_left_ratio(pos, origin, direction, scale));
  }

  // Pixel Tap Mapping: -|x|x|-|-
  //                    x|A|B|C|-
  //                    x|D|E|F|-
  //                    -|G|H|-|-
  //                    -|-|-|-|-
  if(blendResult.x != BLEND_NONE)
  {
    float dist_D_C = DistYCbCr(D, C);
    float dist_B_G = DistYCbCr(B, G);
    bool doLineBlend = (blendResult.x == BLEND_DOMINANT ||
                !((blendResult.w != BLEND_NONE && !IsPixEqual(E, C)) || (blendResult.y != BLEND_NONE && !IsPixEqual(E, G)) ||
                  (IsPixEqual(C, B) && IsPixEqual(B, A) && IsPixEqual(A, D) && IsPixEqual(D, G) && !IsPixEqual(E, A))));

    float2 origin = float2(0.0, -1.0 / sqrt(2.0));
    float2 direction = float2(-1.0, 1.0);
    if(doLineBlend)
    {
      bool haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_D_C <= dist_B_G) && VECTOR_NEQ(E,C) && VECTOR_NEQ(F,C);
      bool haveSteepLine = (STEEP_DIRECTION_THRESHOLD * dist_B_G <= dist_D_C) && VECTOR_NEQ(E,G) && VECTOR_NEQ(H,G);
      origin = haveShallowLine? float2(0.0, -0.25) : float2(0.0, -0.5);
      direction.x -= haveShallowLine? 1.0: 0.0;
      direction.y += haveSteepLine? 1.0: 0.0;
    }

    float4 blendPix = lerp(D,B, step(DistYCbCr(E, B), DistYCbCr(E, D)));
    float blendW = lerp(Dw,Bw, step(DistYCbCr(E, B), DistYCbCr(E, D)));
    res = lerp(res, blendPix, get_left_ratio(pos, origin, direction, scale));
    resW = lerp(resW, blendW, get_left_ratio(pos, origin, direction, scale));
  }

  ialpha = res.w;
  texcol = float4(res.xyz, resW);

  // Compensate for partially transparent sampling.
  if (ialpha > 0.0)
    texcol.rgb /= float3(ialpha, ialpha, ialpha);

#if BINALPHA
  ialpha = (ialpha >= 0.5) ? 1.0 : 0.0;
#endif
}

#undef P

)";
  }
}

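// Fragment shader for batched draws: handles palette lookups, texture filtering, dithering,
// mask-bit output, and (optionally) shader-based blending via ROV or framebuffer fetch.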
std::string GPU_HW_ShaderGen::GenerateBatchFragmentShader(
  GPU_HW::BatchRenderMode render_mode, GPUTransparencyMode transparency, GPU_HW::BatchTextureMode texture_mode,
  GPUTextureFilter texture_filtering, bool uv_limits, bool force_round_texcoords, bool dithering, bool interlacing,
  bool check_mask, bool use_rov, bool use_rov_depth, bool rov_depth_test)
{
  // TODO: don't write depth for shader blend
  DebugAssert(transparency == GPUTransparencyMode::Disabled || render_mode == GPU_HW::BatchRenderMode::ShaderBlend);
  DebugAssert(!rov_depth_test || (use_rov && use_rov_depth));

  const bool textured = (texture_mode != GPU_HW::BatchTextureMode::Disabled);
  const bool palette =
    (texture_mode == GPU_HW::BatchTextureMode::Palette4Bit || texture_mode == GPU_HW::BatchTextureMode::Palette8Bit);
  const bool shader_blending = (render_mode == GPU_HW::BatchRenderMode::ShaderBlend);
  const bool use_dual_source = (!shader_blending && !use_rov && m_supports_dual_source_blend &&
                                ((render_mode != GPU_HW::BatchRenderMode::TransparencyDisabled &&
                                  render_mode != GPU_HW::BatchRenderMode::OnlyOpaque) ||
                                 texture_filtering != GPUTextureFilter::Nearest));

  std::stringstream ss;
  WriteHeader(ss, use_rov);
  DefineMacro(ss, "TRANSPARENCY", render_mode != GPU_HW::BatchRenderMode::TransparencyDisabled);
  DefineMacro(ss, "TRANSPARENCY_ONLY_OPAQUE", render_mode == GPU_HW::BatchRenderMode::OnlyOpaque);
  DefineMacro(ss, "TRANSPARENCY_ONLY_TRANSPARENT", render_mode == GPU_HW::BatchRenderMode::OnlyTransparent);
  DefineMacro(ss, "TRANSPARENCY_MODE", static_cast<s32>(transparency));
  DefineMacro(ss, "SHADER_BLENDING", shader_blending);
  DefineMacro(ss, "CHECK_MASK_BIT", check_mask);
  DefineMacro(ss, "TEXTURED", textured);
  DefineMacro(ss, "PALETTE", palette);
  DefineMacro(ss, "PALETTE_4_BIT", texture_mode == GPU_HW::BatchTextureMode::Palette4Bit);
  DefineMacro(ss, "PALETTE_8_BIT", texture_mode == GPU_HW::BatchTextureMode::Palette8Bit);
  DefineMacro(ss, "DITHERING", dithering);
  DefineMacro(ss, "DITHERING_SCALED", m_scaled_dithering);
  // Debanding requires true color to work correctly.
  DefineMacro(ss, "DEBANDING", m_true_color && m_debanding);
  DefineMacro(ss, "INTERLACING", interlacing);
  DefineMacro(ss, "TRUE_COLOR", m_true_color);
  DefineMacro(ss, "TEXTURE_FILTERING", texture_filtering != GPUTextureFilter::Nearest);
  DefineMacro(ss, "UV_LIMITS", uv_limits);
  DefineMacro(ss, "USE_ROV", use_rov);
  DefineMacro(ss, "USE_ROV_DEPTH", use_rov_depth);
  DefineMacro(ss, "ROV_DEPTH_TEST", rov_depth_test);
  DefineMacro(ss, "USE_DUAL_SOURCE", use_dual_source);
  DefineMacro(ss, "WRITE_MASK_AS_DEPTH", m_write_mask_as_depth);
  DefineMacro(ss, "FORCE_ROUND_TEXCOORDS", force_round_texcoords);
  DefineMacro(ss, "UPSCALED", m_resolution_scale > 1);

  WriteCommonFunctions(ss);
  WriteBatchUniformBuffer(ss);
  DeclareTexture(ss, "samp0", 0);

  if (use_rov)
  {
    DeclareImage(ss, "rov_color", 0);
    if (use_rov_depth)
      DeclareImage(ss, "rov_depth", 1, true);
  }

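  // Flatten the 4x4 DITHER_MATRIX into a 16-entry constant array; the shader indexes it as [y * 4 + x].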
  if (m_glsl)
    ss << "CONSTANT int[16] s_dither_values = int[16]( ";
  else
    ss << "CONSTANT int s_dither_values[] = {";
  for (u32 i = 0; i < 16; i++)
  {
    if (i > 0)
      ss << ", ";
    ss << DITHER_MATRIX[i / 4][i % 4];
  }
  if (m_glsl)
    ss << " );\n";
  else
    ss << "};\n";

  ss << R"(
uint3 ApplyDithering(uint2 coord, uint3 icol)
{
#if DITHERING_SCALED
  uint2 fc = coord & uint2(3u, 3u);
#else
  uint2 fc = (coord / uint2(RESOLUTION_SCALE, RESOLUTION_SCALE)) & uint2(3u, 3u);
#endif
  int offset = s_dither_values[fc.y * 4u + fc.x];

#if !TRUE_COLOR
  return uint3(clamp((int3(icol) + int3(offset, offset, offset)) >> 3, 0, 31));
#else
  return uint3(clamp(int3(icol) + int3(offset, offset, offset), 0, 255));
#endif
}

#if TEXTURED
CONSTANT float4 TRANSPARENT_PIXEL_COLOR = float4(0.0, 0.0, 0.0, 0.0);

#if PALETTE
#define TEXPAGE_VALUE uint4
#else
#define TEXPAGE_VALUE uint2
#endif

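// Applies the GPU texture window: texel coordinates are masked and offset using values
// precomputed host-side from the texture window registers.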
uint2 ApplyTextureWindow(uint2 coords)
{
  uint x = (uint(coords.x) & u_texture_window_and.x) | u_texture_window_or.x;
  uint y = (uint(coords.y) & u_texture_window_and.y) | u_texture_window_or.y;
  return uint2(x, y);
}

uint2 FloatToIntegerCoords(float2 coords)
{
  // With the vertex offset applied at 1x resolution scale, we want to round the texture coordinates.
  // Floor them otherwise, as it currently breaks when upscaling as the vertex offset is not applied.
  return uint2((RESOLUTION_SCALE == 1u || FORCE_ROUND_TEXCOORDS != 0) ? roundEven(coords) : floor(coords));
}

float4 SampleFromVRAM(TEXPAGE_VALUE texpage, float2 coords)
{
#if PALETTE
  uint2 icoord = ApplyTextureWindow(FloatToIntegerCoords(coords));

  uint2 vicoord;
#if PALETTE_4_BIT
  // 4bit will never wrap, since it's in the last texpage row.
  vicoord = uint2(texpage.x + (icoord.x / 4u), texpage.y + icoord.y);
#elif PALETTE_8_BIT
  // 8bit can wrap in the X direction.
  vicoord = uint2((texpage.x + (icoord.x / 2u)) & 0x3FFu, texpage.y + icoord.y);
#endif

  // load colour/palette
  float4 texel = LOAD_TEXTURE(samp0, int2(vicoord * RESOLUTION_SCALE), 0);
  uint vram_value = RGBA8ToRGBA5551(texel);

  // apply palette
#if PALETTE_4_BIT
  uint subpixel = icoord.x & 3u;
  uint palette_index = (vram_value >> (subpixel * 4u)) & 0x0Fu;
  uint2 palette_icoord = uint2((texpage.z + palette_index), texpage.w);
#elif PALETTE_8_BIT
  // can only wrap in X direction for 8-bit, 4-bit will fit in texpage size.
  uint subpixel = icoord.x & 1u;
  uint palette_index = (vram_value >> (subpixel * 8u)) & 0xFFu;
  uint2 palette_icoord = uint2(((texpage.z + palette_index) & 0x3FFu), texpage.w);
#endif

  return LOAD_TEXTURE(samp0, int2(palette_icoord * RESOLUTION_SCALE), 0);
#else
  // Direct texturing - usually render-to-texture effects.
  uint2 vicoord;
#if !UPSCALED
  uint2 icoord = ApplyTextureWindow(FloatToIntegerCoords(coords));
  vicoord = (texpage.xy + icoord) & uint2(1023, 511);
#else
  // Coordinates are already upscaled, we need to downscale them to apply the texture
  // window, then re-upscale/offset. We can't round here, because it could result in
  // going outside of the texture window.
  float2 ncoords = coords / float(RESOLUTION_SCALE);
  float2 nfpart = frac(ncoords);
  uint2 nicoord = ApplyTextureWindow(uint2(floor(ncoords)));
  uint2 nvicoord = (texpage.xy + nicoord) & uint2(1023, 511);
  coords = (float2(nvicoord) + nfpart) * float(RESOLUTION_SCALE);
  vicoord = uint2(floor(coords));
#endif

  return LOAD_TEXTURE(samp0, int2(vicoord), 0);
#endif
}

#endif

// From https://alex.vlachos.com/graphics/Alex_Vlachos_Advanced_VR_Rendering_GDC2015.pdf
// and https://www.shadertoy.com/view/MslGR8 (5th one starting from the bottom)
// NOTE: `frag_coord` is in pixels (i.e. not normalized UV).
float3 ApplyDebanding(float2 frag_coord)
{
#if DEBANDING
  // Iestyn's RGB dither (7 asm instructions) from Portal 2 X360, slightly modified for VR.
  float ditherc = dot(vec2(171.0, 231.0), frag_coord);
  float3 dither = float3(ditherc, ditherc, ditherc);
  dither = fract(dither / float3(103.0, 71.0, 97.0));

  // Subtract 0.5 to avoid slightly brightening the whole viewport.
  return (dither - 0.5) / 255.0;
#else
  return float3(0.0, 0.0, 0.0);
#endif
}
)";

  const u32 num_fragment_outputs = use_rov ? 0 : (use_dual_source ? 2 : 1);
  if (textured)
  {
    if (texture_filtering != GPUTextureFilter::Nearest)
      WriteBatchTextureFilter(ss, texture_filtering);

    if (uv_limits)
    {
      DeclareFragmentEntryPoint(ss, 1, 1,
                                {{"nointerpolation", palette ? "uint4 v_texpage" : "uint2 v_texpage"},
                                 {"nointerpolation", "float4 v_uv_limits"}},
                                true, num_fragment_outputs, use_dual_source, m_write_mask_as_depth, UsingMSAA(),
                                UsingPerSampleShading(), false, m_disable_color_perspective,
                                shader_blending && !use_rov, use_rov);
    }
    else
    {
      DeclareFragmentEntryPoint(ss, 1, 1, {{"nointerpolation", palette ? "uint4 v_texpage" : "uint2 v_texpage"}}, true,
                                num_fragment_outputs, use_dual_source, m_write_mask_as_depth, UsingMSAA(),
                                UsingPerSampleShading(), false, m_disable_color_perspective,
                                shader_blending && !use_rov, use_rov);
    }
  }
  else
  {
    DeclareFragmentEntryPoint(ss, 1, 0, {}, true, num_fragment_outputs, use_dual_source, m_write_mask_as_depth,
                              UsingMSAA(), UsingPerSampleShading(), false, m_disable_color_perspective,
                              shader_blending && !use_rov, use_rov);
  }

  ss << R"(
{
  uint3 vertcol = uint3(v_col0.rgb * float3(255.0, 255.0, 255.0) + ApplyDebanding(v_pos.xy));
  uint2 fragpos = uint2(v_pos.xy);

  bool semitransparent;
  uint3 icolor;
  float ialpha;
  float oalpha;

#if INTERLACING
  if ((fragpos.y & 1u) == u_interlaced_displayed_field)
    discard;
#endif

#if TEXTURED
  float4 texcol;
#if TEXTURE_FILTERING
  FilteredSampleFromVRAM(v_texpage, v_tex0, v_uv_limits, texcol, ialpha);
  if (ialpha < 0.5)
    discard;
#else
#if UV_LIMITS
  texcol = SampleFromVRAM(v_texpage, clamp(v_tex0, v_uv_limits.xy, v_uv_limits.zw));
#else
  texcol = SampleFromVRAM(v_texpage, v_tex0);
#endif
  if (VECTOR_EQ(texcol, TRANSPARENT_PIXEL_COLOR))
    discard;

  ialpha = 1.0;
#endif

  semitransparent = (texcol.a >= 0.5);

  // If not using true color, truncate the framebuffer colors to 5-bit.
#if !TRUE_COLOR
  icolor = uint3(texcol.rgb * float3(255.0, 255.0, 255.0)) >> 3;
  icolor = (icolor * vertcol) >> 4;
#if DITHERING
  icolor = ApplyDithering(fragpos, icolor);
#else
  icolor = min(icolor >> 3, uint3(31u, 31u, 31u));
#endif
#else
  icolor = uint3(texcol.rgb * float3(255.0, 255.0, 255.0) + ApplyDebanding(v_pos.xy));
  icolor = (icolor * vertcol) >> 7;
#if DITHERING
  icolor = ApplyDithering(fragpos, icolor);
#else
  icolor = min(icolor, uint3(255u, 255u, 255u));
#endif
#endif

  // Compute output alpha (mask bit)
  oalpha = float(u_set_mask_while_drawing ? 1 : int(semitransparent));
#else
  // All pixels are semitransparent for untextured polygons.
  semitransparent = true;
  icolor = vertcol;
  ialpha = 1.0;

#if DITHERING
  icolor = ApplyDithering(fragpos, icolor);
#else
#if !TRUE_COLOR
  icolor >>= 3;
#endif
#endif

  // However, the mask bit is cleared if set mask bit is false.
  oalpha = float(u_set_mask_while_drawing);
#endif

#if SHADER_BLENDING
#if USE_ROV
  BEGIN_ROV_REGION;
  float4 bg_col = ROV_LOAD(rov_color, fragpos);
  float4 o_col0;
  bool discarded = false;

#if ROV_DEPTH_TEST
  float bg_depth = ROV_LOAD(rov_depth, fragpos).r;
  discarded = (v_pos.z > bg_depth);
#endif
#if CHECK_MASK_BIT
  discarded = discarded || (bg_col.a != 0.0);
#endif
#else
  float4 bg_col = LAST_FRAG_COLOR;
#if CHECK_MASK_BIT
  if (bg_col.a != 0.0)
    discard;
#endif
#endif

  // Work in normalized space for true colour, matches HW blend.
  float4 fg_col = float4(float3(icolor), oalpha);
#if TRUE_COLOR
  fg_col.rgb /= 255.0;
#elif TRANSPARENCY // rgb not used in check-mask only
  bg_col.rgb = roundEven(bg_col.rgb * 31.0);
#endif

#if TEXTURE_FILTERING
#if TRANSPARENCY_MODE == 0 || TRANSPARENCY_MODE == 3
  bg_col.rgb /= ialpha;
#endif
  fg_col.rgb *= ialpha;
#endif

  o_col0.a = fg_col.a;
#if TRANSPARENCY_MODE == 0 // Half BG + Half FG.
  o_col0.rgb = (bg_col.rgb * 0.5) + (fg_col.rgb * 0.5);
#elif TRANSPARENCY_MODE == 1 // BG + FG
  o_col0.rgb = bg_col.rgb + fg_col.rgb;
#elif TRANSPARENCY_MODE == 2 // BG - FG
  o_col0.rgb = bg_col.rgb - fg_col.rgb;
#elif TRANSPARENCY_MODE == 3 // BG + 1/4 FG.
  o_col0.rgb = bg_col.rgb + (fg_col.rgb * 0.25);
#else
  o_col0.rgb = fg_col.rgb;
#endif

  // 16-bit truncation.
#if !TRUE_COLOR && TRANSPARENCY
  o_col0.rgb = floor(o_col0.rgb);
#endif

#if TRANSPARENCY
  // If pixel isn't marked as semitransparent, replace with previous colour.
  o_col0 = semitransparent ? o_col0 : fg_col;
#endif

  // Normalize for non-true-color.
#if !TRUE_COLOR
  o_col0.rgb /= 31.0;
#endif

#if USE_ROV
  if (!discarded)
  {
    ROV_STORE(rov_color, fragpos, o_col0);
#if USE_ROV_DEPTH
    ROV_STORE(rov_depth, fragpos, float4(v_pos.z, 0.0, 0.0, 0.0));
#endif
  }
  END_ROV_REGION;
#endif
#else
  // Premultiply alpha so we don't need to use a colour output for it.
  float premultiply_alpha = ialpha;
#if TRANSPARENCY
  premultiply_alpha = ialpha * (semitransparent ? u_src_alpha_factor : 1.0);
#endif

  float3 color;
#if !TRUE_COLOR
  // We want to apply the alpha before the truncation to 16-bit, otherwise we'll be passing a 32-bit precision color
  // into the blend unit, which can cause a small amount of error to accumulate.
  color = floor(float3(icolor) * premultiply_alpha) / 31.0;
#else
  // True color is actually simpler here since we want to preserve the precision.
  color = (float3(icolor) * premultiply_alpha) / 255.0;
#endif

#if TRANSPARENCY && TEXTURED
  // Apply semitransparency. If not a semitransparent texel, destination alpha is ignored.
  if (semitransparent)
  {
#if USE_DUAL_SOURCE
    o_col0 = float4(color, oalpha);
    o_col1 = float4(0.0, 0.0, 0.0, u_dst_alpha_factor / ialpha);
#else
    o_col0 = float4(color, oalpha);
#endif

#if WRITE_MASK_AS_DEPTH
    o_depth = oalpha * v_pos.z;
#endif

#if TRANSPARENCY_ONLY_OPAQUE
    discard;
#endif
  }
  else
  {
#if USE_DUAL_SOURCE
    o_col0 = float4(color, oalpha);
    o_col1 = float4(0.0, 0.0, 0.0, 1.0 - ialpha);
#else
    o_col0 = float4(color, oalpha);
#endif

#if WRITE_MASK_AS_DEPTH
    o_depth = oalpha * v_pos.z;
#endif

#if TRANSPARENCY_ONLY_TRANSPARENT
    discard;
#endif
  }
#elif TRANSPARENCY
  // We shouldn't be rendering opaque geometry only when untextured, so no need to test/discard here.
#if USE_DUAL_SOURCE
  o_col0 = float4(color, oalpha);
  o_col1 = float4(0.0, 0.0, 0.0, u_dst_alpha_factor / ialpha);
#else
  o_col0 = float4(color, oalpha);
#endif

#if WRITE_MASK_AS_DEPTH
  o_depth = oalpha * v_pos.z;
#endif
#else
  // Non-transparency won't enable blending so we can write the mask here regardless.
  o_col0 = float4(color, oalpha);

#if USE_DUAL_SOURCE
  o_col1 = float4(0.0, 0.0, 0.0, 1.0 - ialpha);
#endif

#if WRITE_MASK_AS_DEPTH
  o_depth = oalpha * v_pos.z;
#endif
#endif
#endif
}
)";

  return ss.str();
}

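// Extracts the displayed portion of VRAM for presentation, optionally unpacking 24-bit colour,
// resolving MSAA, and copying the depth buffer alongside it.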
std::string GPU_HW_ShaderGen::GenerateVRAMExtractFragmentShader(bool color_24bit, bool depth_buffer)
{
  std::stringstream ss;
  WriteHeader(ss);
  DefineMacro(ss, "COLOR_24BIT", color_24bit);
  DefineMacro(ss, "DEPTH_BUFFER", depth_buffer);
  DefineMacro(ss, "MULTISAMPLED", UsingMSAA());

  WriteCommonFunctions(ss);
  DeclareUniformBuffer(ss, {"uint2 u_vram_offset", "uint u_skip_x", "uint u_line_skip"}, true);
  DeclareTexture(ss, "samp0", 0, UsingMSAA());
  if (depth_buffer)
    DeclareTexture(ss, "samp1", 1, UsingMSAA());

  ss << R"(
float4 LoadVRAM(int2 coords)
{
#if MULTISAMPLING
  float4 value = LOAD_TEXTURE_MS(samp0, coords, 0u);
  FOR_UNROLL (uint sample_index = 1u; sample_index < MULTISAMPLES; sample_index++)
    value += LOAD_TEXTURE_MS(samp0, coords, sample_index);
  value /= float(MULTISAMPLES);
  return value;
#else
  return LOAD_TEXTURE(samp0, coords, 0);
#endif
}

#if DEPTH_BUFFER
float LoadDepth(int2 coords)
{
  // Need to duplicate because different types in different languages...
#if MULTISAMPLING
  float value = LOAD_TEXTURE_MS(samp1, coords, 0u).r;
  FOR_UNROLL (uint sample_index = 1u; sample_index < MULTISAMPLES; sample_index++)
    value += LOAD_TEXTURE_MS(samp1, coords, sample_index).r;
  value /= float(MULTISAMPLES);
  return value;
#else
  return LOAD_TEXTURE(samp1, coords, 0).r;
#endif
}
#endif

float3 SampleVRAM24(uint2 icoords)
{
  // load adjacent 16-bit texels
  uint2 clamp_size = uint2(1024, 512);

  // relative to start of scanout
  uint2 vram_coords = u_vram_offset + uint2((icoords.x * 3u) / 2u, icoords.y);
  uint s0 = RGBA8ToRGBA5551(LoadVRAM(int2((vram_coords % clamp_size) * RESOLUTION_SCALE)));
  uint s1 = RGBA8ToRGBA5551(LoadVRAM(int2(((vram_coords + uint2(1, 0)) % clamp_size) * RESOLUTION_SCALE)));

  // select which part of the combined 16-bit texels we are currently shading
  uint s1s0 = ((s1 << 16) | s0) >> ((icoords.x & 1u) * 8u);

  // extract components and normalize
  return float3(float(s1s0 & 0xFFu) / 255.0, float((s1s0 >> 8u) & 0xFFu) / 255.0,
                float((s1s0 >> 16u) & 0xFFu) / 255.0);
}
)";

  DeclareFragmentEntryPoint(ss, 0, 1, {}, true, depth_buffer ? 2 : 1);
  ss << R"(
{
  uint2 icoords = uint2(uint(v_pos.x) + u_skip_x, uint(v_pos.y) << u_line_skip);
  int2 wrapped_coords = int2((icoords + u_vram_offset) % VRAM_SIZE);

#if COLOR_24BIT
  o_col0 = float4(SampleVRAM24(icoords), 1.0);
#else
  o_col0 = float4(LoadVRAM(wrapped_coords).rgb, 1.0);
#endif

#if DEPTH_BUFFER
  o_col1 = float4(LoadDepth(wrapped_coords), 0.0, 0.0, 0.0);
#endif
}
)";

  return ss.str();
}

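// Geometry shader that expands each triangle into three line segments for wireframe rendering.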
std::string GPU_HW_ShaderGen::GenerateWireframeGeometryShader()
{
  std::stringstream ss;
  WriteHeader(ss);
  WriteCommonFunctions(ss);

  if (m_glsl)
  {
    ss << R"(
layout(triangles) in;
layout(line_strip, max_vertices = 6) out;

void main()
{
  gl_Position = gl_in[0].gl_Position;
  EmitVertex();
  gl_Position = gl_in[1].gl_Position;
  EmitVertex();
  EndPrimitive();
  gl_Position = gl_in[1].gl_Position;
  EmitVertex();
  gl_Position = gl_in[2].gl_Position;
  EmitVertex();
  EndPrimitive();
  gl_Position = gl_in[2].gl_Position;
  EmitVertex();
  gl_Position = gl_in[0].gl_Position;
  EmitVertex();
  EndPrimitive();
}
)";
  }
  else
  {
    ss << R"(
struct GSInput
{
  float4 col0 : COLOR0;
  float4 pos : SV_Position;
};

struct GSOutput
{
  float4 pos : SV_Position;
};

GSOutput GetVertex(GSInput vi)
{
  GSOutput vo;
  vo.pos = vi.pos;
  return vo;
}

[maxvertexcount(6)]
void main(triangle GSInput input[3], inout LineStream<GSOutput> output)
{
  output.Append(GetVertex(input[0]));
  output.Append(GetVertex(input[1]));
  output.RestartStrip();

  output.Append(GetVertex(input[1]));
  output.Append(GetVertex(input[2]));
  output.RestartStrip();

  output.Append(GetVertex(input[2]));
  output.Append(GetVertex(input[0]));
  output.RestartStrip();
}
)";
  }

  return ss.str();
}

std::string GPU_HW_ShaderGen::GenerateWireframeFragmentShader()
{
  std::stringstream ss;
  WriteHeader(ss);
  WriteCommonFunctions(ss);

  DeclareFragmentEntryPoint(ss, 0, 0);
  ss << R"(
{
  o_col0 = float4(1.0, 1.0, 1.0, 0.5);
}
)";

  return ss.str();
}

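// VRAM readback shader: box-filters the upscaled framebuffer back to native resolution and
// packs two 16-bit pixels into each RGBA8 output texel.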
std::string GPU_HW_ShaderGen::GenerateVRAMReadFragmentShader()
{
  std::stringstream ss;
  WriteHeader(ss);
  WriteCommonFunctions(ss);
  DeclareUniformBuffer(ss, {"uint2 u_base_coords", "uint2 u_size"}, true);

  DeclareTexture(ss, "samp0", 0, UsingMSAA());

  ss << R"(
float4 LoadVRAM(int2 coords)
{
#if MULTISAMPLING
  float4 value = LOAD_TEXTURE_MS(samp0, coords, 0u);
  FOR_UNROLL (uint sample_index = 1u; sample_index < MULTISAMPLES; sample_index++)
    value += LOAD_TEXTURE_MS(samp0, coords, sample_index);
  value /= float(MULTISAMPLES);
  return value;
#else
  return LOAD_TEXTURE(samp0, coords, 0);
#endif
}

uint SampleVRAM(uint2 coords)
{
  if (RESOLUTION_SCALE == 1u)
    return RGBA8ToRGBA5551(LoadVRAM(int2(coords)));

  // Box filter for downsampling.
  float4 value = float4(0.0, 0.0, 0.0, 0.0);
  uint2 base_coords = coords * uint2(RESOLUTION_SCALE, RESOLUTION_SCALE);
  for (uint offset_x = 0u; offset_x < RESOLUTION_SCALE; offset_x++)
  {
    for (uint offset_y = 0u; offset_y < RESOLUTION_SCALE; offset_y++)
      value += LoadVRAM(int2(base_coords + uint2(offset_x, offset_y)));
  }
  value /= float(RESOLUTION_SCALE * RESOLUTION_SCALE);
  return RGBA8ToRGBA5551(value);
}
)";

  DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1);
  ss << R"(
{
  uint2 sample_coords = uint2(uint(v_pos.x) * 2u, uint(v_pos.y));
  sample_coords += u_base_coords;

  // We're encoding as 32-bit, so the output width is halved and we pack two 16-bit pixels in one 32-bit pixel.
  uint left = SampleVRAM(sample_coords);
  uint right = SampleVRAM(uint2(sample_coords.x + 1u, sample_coords.y));

  o_col0 = float4(float(left & 0xFFu), float((left >> 8) & 0xFFu),
                  float(right & 0xFFu), float((right >> 8) & 0xFFu))
           / float4(255.0, 255.0, 255.0, 255.0);
})";

  return ss.str();
}

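// Upload shader for CPU-to-VRAM writes: fetches 16-bit words from a texture, texel buffer or
// SSBO, expands them to RGBA8, and optionally writes the mask bit to depth.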
std::string GPU_HW_ShaderGen::GenerateVRAMWriteFragmentShader(bool use_buffer, bool use_ssbo)
{
  std::stringstream ss;
  WriteHeader(ss);
  WriteCommonFunctions(ss);
  DefineMacro(ss, "WRITE_MASK_AS_DEPTH", m_write_mask_as_depth);
  DefineMacro(ss, "USE_BUFFER", use_buffer);
  DeclareUniformBuffer(ss,
                       {"uint2 u_base_coords", "uint2 u_end_coords", "uint2 u_size", "uint u_buffer_base_offset",
                        "uint u_mask_or_bits", "float u_depth_value"},
                       true);

  if (!use_buffer)
  {
    DeclareTexture(ss, "samp0", 0, false, true, true);
  }
  else if (use_ssbo && m_glsl)
  {
    ss << "layout(std430";
    if (IsVulkan())
      ss << ", set = 0, binding = 0";
    else if (IsMetal())
      ss << ", set = 0, binding = 1";
    else if (m_use_glsl_binding_layout)
      ss << ", binding = 0";

    ss << ") readonly restrict buffer SSBO {\n";
    ss << " uint ssbo_data[];\n";
    ss << "};\n\n";

    ss << "#define GET_VALUE(buffer_offset) (ssbo_data[(buffer_offset) / 2u] >> (((buffer_offset) % 2u) * 16u))\n\n";
  }
  else
  {
    DeclareTextureBuffer(ss, "samp0", 0, true, true);
    ss << "#define GET_VALUE(buffer_offset) (LOAD_TEXTURE_BUFFER(samp0, int(buffer_offset)).r)\n\n";
  }

  DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1, false, m_write_mask_as_depth);
  ss << R"(
{
  uint2 coords = uint2(v_pos.xy) / uint2(RESOLUTION_SCALE, RESOLUTION_SCALE);

  // make sure it's not oversized and out of range
  if ((coords.x < u_base_coords.x && coords.x >= u_end_coords.x) ||
      (coords.y < u_base_coords.y && coords.y >= u_end_coords.y))
  {
    discard;
  }

  // find offset from the start of the row/column
  uint2 offset;
  offset.x = (coords.x < u_base_coords.x) ? ((VRAM_SIZE.x / RESOLUTION_SCALE) - u_base_coords.x + coords.x) : (coords.x - u_base_coords.x);
  offset.y = (coords.y < u_base_coords.y) ? ((VRAM_SIZE.y / RESOLUTION_SCALE) - u_base_coords.y + coords.y) : (coords.y - u_base_coords.y);

#if !USE_BUFFER
  uint value = LOAD_TEXTURE(samp0, int2(offset), 0).x;
#else
  uint buffer_offset = u_buffer_base_offset + (offset.y * u_size.x) + offset.x;
  uint value = GET_VALUE(buffer_offset) | u_mask_or_bits;
#endif

  o_col0 = RGBA5551ToRGBA8(value);
#if WRITE_MASK_AS_DEPTH
  o_depth = (o_col0.a == 1.0) ? u_depth_value : 0.0;
#endif
})";

  return ss.str();
}

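// VRAM-to-VRAM copy with wrap-around handling; optionally forces the mask bit on the
// destination and updates the depth copy when the mask is tracked as depth.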
std::string GPU_HW_ShaderGen::GenerateVRAMCopyFragmentShader()
{
  // TODO: This won't currently work because we can't bind the texture to both the shader and framebuffer.
  const bool msaa = false;

  std::stringstream ss;
  WriteHeader(ss);
  WriteCommonFunctions(ss);
  DefineMacro(ss, "WRITE_MASK_AS_DEPTH", m_write_mask_as_depth);
  DeclareUniformBuffer(ss,
                       {"uint2 u_src_coords", "uint2 u_dst_coords", "uint2 u_end_coords", "uint2 u_size",
                        "bool u_set_mask_bit", "float u_depth_value"},
                       true);

  DeclareTexture(ss, "samp0", 0, msaa);
  DefineMacro(ss, "MSAA_COPY", msaa);
  DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1, false, m_write_mask_as_depth, false, false, msaa);
  ss << R"(
{
  uint2 dst_coords = uint2(v_pos.xy);

  // make sure it's not oversized and out of range
  if ((dst_coords.x < u_dst_coords.x && dst_coords.x >= u_end_coords.x) ||
      (dst_coords.y < u_dst_coords.y && dst_coords.y >= u_end_coords.y))
  {
    discard;
  }

  // find offset from the start of the row/column
  uint2 offset;
  offset.x = (dst_coords.x < u_dst_coords.x) ? (VRAM_SIZE.x - u_dst_coords.x + dst_coords.x) : (dst_coords.x - u_dst_coords.x);
  offset.y = (dst_coords.y < u_dst_coords.y) ? (VRAM_SIZE.y - u_dst_coords.y + dst_coords.y) : (dst_coords.y - u_dst_coords.y);

  // find the source coordinates to copy from
  uint2 src_coords = (u_src_coords + offset) % VRAM_SIZE;

  // sample and apply mask bit
#if MSAA_COPY
  float4 color = LOAD_TEXTURE_MS(samp0, int2(src_coords), f_sample_index);
#else
  float4 color = LOAD_TEXTURE(samp0, int2(src_coords), 0);
#endif
  o_col0 = float4(color.xyz, u_set_mask_bit ? 1.0 : color.a);
#if WRITE_MASK_AS_DEPTH
  o_depth = (u_set_mask_bit ? 1.0f : ((o_col0.a == 1.0) ? u_depth_value : 0.0));
#endif
})";

  return ss.str();
}

std::string GPU_HW_ShaderGen::GenerateVRAMFillFragmentShader(bool wrapped, bool interlaced)
{
  std::stringstream ss;
  WriteHeader(ss);
  WriteCommonFunctions(ss);
  DefineMacro(ss, "WRITE_MASK_AS_DEPTH", m_write_mask_as_depth);
  DefineMacro(ss, "WRAPPED", wrapped);
  DefineMacro(ss, "INTERLACED", interlaced);

  DeclareUniformBuffer(
    ss, {"uint2 u_dst_coords", "uint2 u_end_coords", "float4 u_fill_color", "uint u_interlaced_displayed_field"}, true);

  DeclareFragmentEntryPoint(ss, 0, 1, {}, interlaced || wrapped, 1, false, m_write_mask_as_depth, false, false, false);
  ss << R"(
{
#if INTERLACED || WRAPPED
  uint2 dst_coords = uint2(v_pos.xy);
#endif

#if INTERLACED
  if ((dst_coords.y & 1u) == u_interlaced_displayed_field)
    discard;
#endif

#if WRAPPED
  // make sure it's not oversized and out of range
  if ((dst_coords.x < u_dst_coords.x && dst_coords.x >= u_end_coords.x) ||
      (dst_coords.y < u_dst_coords.y && dst_coords.y >= u_end_coords.y))
  {
    discard;
  }
#endif

  o_col0 = u_fill_color;
#if WRITE_MASK_AS_DEPTH
  o_depth = u_fill_color.a;
#endif
})";

  return ss.str();
}

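// Rebuilds the depth buffer from the mask bits stored in the VRAM colour texture's alpha channel.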
std::string GPU_HW_ShaderGen::GenerateVRAMUpdateDepthFragmentShader()
{
  std::stringstream ss;
  WriteHeader(ss);
  WriteCommonFunctions(ss);
  DeclareTexture(ss, "samp0", 0, UsingMSAA());
  DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 0, false, true, false, false, UsingMSAA());

  ss << R"(
{
#if MULTISAMPLING
  o_depth = LOAD_TEXTURE_MS(samp0, int2(v_pos.xy), f_sample_index).a;
#else
  o_depth = LOAD_TEXTURE(samp0, int2(v_pos.xy), 0).a;
#endif
}
)";

  return ss.str();
}

void GPU_HW_ShaderGen::WriteAdaptiveDownsampleUniformBuffer(std::stringstream& ss)
{
  DeclareUniformBuffer(ss, {"float2 u_uv_min", "float2 u_uv_max", "float2 u_rcp_resolution", "float u_lod"}, true);
}

std::string GPU_HW_ShaderGen::GenerateAdaptiveDownsampleVertexShader()
{
  std::stringstream ss;
  WriteHeader(ss);
  WriteAdaptiveDownsampleUniformBuffer(ss);
  DeclareVertexEntryPoint(ss, {}, 0, 1, {}, true);
  ss << R"(
{
  v_tex0 = float2(float((v_id << 1) & 2u), float(v_id & 2u));
  v_pos = float4(v_tex0 * float2(2.0f, -2.0f) + float2(-1.0f, 1.0f), 0.0f, 1.0f);
  v_tex0 = u_uv_min + (u_uv_max - u_uv_min) * v_tex0;
#if API_OPENGL || API_OPENGL_ES || API_VULKAN
  v_pos.y = -v_pos.y;
#endif
}
)";
  return ss.str();
}

std::string GPU_HW_ShaderGen::GenerateAdaptiveDownsampleMipFragmentShader(bool first_pass)
{
  std::stringstream ss;
  WriteHeader(ss);
  WriteCommonFunctions(ss);
  WriteAdaptiveDownsampleUniformBuffer(ss);
  DeclareTexture(ss, "samp0", 0, false);
  DefineMacro(ss, "FIRST_PASS", first_pass);

  // mipmap_energy.glsl ported from parallel-rsx.
  ss << R"(

float4 get_bias(float3 c00, float3 c01, float3 c10, float3 c11)
{
  // Measure the "energy" (variance) in the pixels.
  // If the pixels are all the same (2D content), use maximum bias, otherwise, taper off quickly back to 0 (edges)
  float3 avg = 0.25 * (c00 + c01 + c10 + c11);
  float s00 = dot(c00 - avg, c00 - avg);
  float s01 = dot(c01 - avg, c01 - avg);
  float s10 = dot(c10 - avg, c10 - avg);
  float s11 = dot(c11 - avg, c11 - avg);
  return float4(avg, 1.0 - log2(1000.0 * (s00 + s01 + s10 + s11) + 1.0));
}

float4 get_bias(float4 c00, float4 c01, float4 c10, float4 c11)
{
  // Measure the "energy" (variance) in the pixels.
  // If the pixels are all the same (2D content), use maximum bias, otherwise, taper off quickly back to 0 (edges)
  float avg = 0.25 * (c00.a + c01.a + c10.a + c11.a);
  float4 bias = get_bias(c00.rgb, c01.rgb, c10.rgb, c11.rgb);
  bias.a *= avg;
  return bias;
}

)";

  DeclareFragmentEntryPoint(ss, 0, 1);
  ss << R"(
{
  float2 uv = v_tex0 - (u_rcp_resolution * 0.25);
#if FIRST_PASS
  vec3 c00 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(0, 0)).rgb;
  vec3 c01 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(0, 1)).rgb;
  vec3 c10 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(1, 0)).rgb;
  vec3 c11 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(1, 1)).rgb;
  o_col0 = get_bias(c00, c01, c10, c11);
#else
  vec4 c00 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(0, 0));
  vec4 c01 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(0, 1));
  vec4 c10 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(1, 0));
  vec4 c11 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(1, 1));
  o_col0 = get_bias(c00, c01, c10, c11);
#endif
}
)";

  return ss.str();
}

std::string GPU_HW_ShaderGen::GenerateAdaptiveDownsampleBlurFragmentShader()
{
  std::stringstream ss;
  WriteHeader(ss);
  WriteCommonFunctions(ss);
  WriteAdaptiveDownsampleUniformBuffer(ss);
  DeclareTexture(ss, "samp0", 0, false);

  // mipmap_blur.glsl ported from parallel-rsx.
  DeclareFragmentEntryPoint(ss, 0, 1);
  ss << R"(
{
  float bias = 0.0;
  const float w0 = 0.25;
  const float w1 = 0.125;
  const float w2 = 0.0625;
#define UV(x, y) clamp((v_tex0 + float2(x, y) * u_rcp_resolution), u_uv_min, u_uv_max)
  bias += w2 * SAMPLE_TEXTURE(samp0, UV(-1.0, -1.0)).a;
  bias += w2 * SAMPLE_TEXTURE(samp0, UV(+1.0, -1.0)).a;
  bias += w2 * SAMPLE_TEXTURE(samp0, UV(-1.0, +1.0)).a;
  bias += w2 * SAMPLE_TEXTURE(samp0, UV(+1.0, +1.0)).a;
  bias += w1 * SAMPLE_TEXTURE(samp0, UV( 0.0, -1.0)).a;
  bias += w1 * SAMPLE_TEXTURE(samp0, UV(-1.0,  0.0)).a;
  bias += w1 * SAMPLE_TEXTURE(samp0, UV(+1.0,  0.0)).a;
  bias += w1 * SAMPLE_TEXTURE(samp0, UV( 0.0, +1.0)).a;
  bias += w0 * SAMPLE_TEXTURE(samp0, UV( 0.0,  0.0)).a;
  o_col0 = float4(bias, bias, bias, bias);
}
)";

  return ss.str();
}

std::string GPU_HW_ShaderGen::GenerateAdaptiveDownsampleCompositeFragmentShader()
{
  std::stringstream ss;
  WriteHeader(ss);
  WriteCommonFunctions(ss);
  DeclareTexture(ss, "samp0", 0, false);
  DeclareTexture(ss, "samp1", 1, false);

  // mipmap_resolve.glsl ported from parallel-rsx.
  DeclareFragmentEntryPoint(ss, 0, 1, {}, true);
  ss << R"(
{
  float bias = SAMPLE_TEXTURE(samp1, v_tex0).r;
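  // bias == 0 keeps the full-resolution (upscaled) image; bias near 1 (flat 2D content)
  // selects a more heavily downsampled mip level.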
  float mip = float(RESOLUTION_SCALE - 1u) * bias;
  float3 color = SAMPLE_TEXTURE_LEVEL(samp0, v_tex0, mip).rgb;
  o_col0 = float4(color, 1.0);
}
)";

  return ss.str();
}

std::string GPU_HW_ShaderGen::GenerateBoxSampleDownsampleFragmentShader(u32 factor)
{
  std::stringstream ss;
  WriteHeader(ss);
  WriteCommonFunctions(ss);
  DeclareUniformBuffer(ss, {"uint2 u_base_coords"}, true);
  DeclareTexture(ss, "samp0", 0, false);

  ss << "#define FACTOR " << factor << "\n";

  DeclareFragmentEntryPoint(ss, 0, 1, {}, true);
  ss << R"(
{
  float3 color = float3(0.0, 0.0, 0.0);
  uint2 base_coords = u_base_coords + uint2(v_pos.xy) * uint2(FACTOR, FACTOR);
  for (uint offset_x = 0u; offset_x < FACTOR; offset_x++)
  {
    for (uint offset_y = 0u; offset_y < FACTOR; offset_y++)
      color += LOAD_TEXTURE(samp0, int2(base_coords + uint2(offset_x, offset_y)), 0).rgb;
  }
  color /= float(FACTOR * FACTOR);
  o_col0 = float4(color, 1.0);
}
)";

  return ss.str();
}