duckstation

duckstation, archived from the revision just before upstream changed it to a proprietary software project; this version is the libre one.
git clone https://git.neptards.moe/u3shit/duckstation.git
Log | Files | Refs | README | LICENSE

gpu_sw_backend.cpp (33350B)


      1 // SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
      2 // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
      3 
      4 #include "gpu_sw_backend.h"
      5 #include "gpu.h"
      6 #include "system.h"
      7 
      8 #include "util/gpu_device.h"
      9 
     10 #include <algorithm>
     11 
     // Constructor: defaulted, so members are initialized exactly as their declarations specify.
     12 GPU_SW_Backend::GPU_SW_Backend() = default;
     13 
     // Destructor: defaulted; no explicit teardown is written in this file.
     14 GPU_SW_Backend::~GPU_SW_Backend() = default;
     15 
     16 bool GPU_SW_Backend::Initialize(bool force_thread)
     17 {
     18   return GPUBackend::Initialize(force_thread);
     19 }
     20 
     21 void GPU_SW_Backend::Reset()
     22 {
     23   GPUBackend::Reset();
     24 }
     25 
     26 void GPU_SW_Backend::DrawPolygon(const GPUBackendDrawPolygonCommand* cmd)
     27 {
     28   const GPURenderCommand rc{cmd->rc.bits};
     29   const bool dithering_enable = rc.IsDitheringEnabled() && cmd->draw_mode.dither_enable;
     30 
     31   const DrawTriangleFunction DrawFunction = GetDrawTriangleFunction(
     32     rc.shading_enable, rc.texture_enable, rc.raw_texture_enable, rc.transparency_enable, dithering_enable);
     33 
     34   (this->*DrawFunction)(cmd, &cmd->vertices[0], &cmd->vertices[1], &cmd->vertices[2]);
     35   if (rc.quad_polygon)
     36     (this->*DrawFunction)(cmd, &cmd->vertices[2], &cmd->vertices[1], &cmd->vertices[3]);
     37 }
     38 
     39 void GPU_SW_Backend::DrawRectangle(const GPUBackendDrawRectangleCommand* cmd)
     40 {
     41   const GPURenderCommand rc{cmd->rc.bits};
     42 
     43   const DrawRectangleFunction DrawFunction =
     44     GetDrawRectangleFunction(rc.texture_enable, rc.raw_texture_enable, rc.transparency_enable);
     45 
     46   (this->*DrawFunction)(cmd);
     47 }
     48 
     49 void GPU_SW_Backend::DrawLine(const GPUBackendDrawLineCommand* cmd)
     50 {
     51   const DrawLineFunction DrawFunction =
     52     GetDrawLineFunction(cmd->rc.shading_enable, cmd->rc.transparency_enable, cmd->IsDitheringEnabled());
     53 
     54   for (u16 i = 1; i < cmd->num_vertices; i++)
     55     (this->*DrawFunction)(cmd, &cmd->vertices[i - 1], &cmd->vertices[i]);
     56 }
     57 
     58 constexpr GPU_SW_Backend::DitherLUT GPU_SW_Backend::ComputeDitherLUT()
     59 {
     60   DitherLUT lut = {};
     61   for (u32 i = 0; i < DITHER_MATRIX_SIZE; i++)
     62   {
     63     for (u32 j = 0; j < DITHER_MATRIX_SIZE; j++)
     64     {
     65       for (u32 value = 0; value < DITHER_LUT_SIZE; value++)
     66       {
     67         const s32 dithered_value = (static_cast<s32>(value) + DITHER_MATRIX[i][j]) >> 3;
     68         lut[i][j][value] = static_cast<u8>((dithered_value < 0) ? 0 : ((dithered_value > 31) ? 31 : dithered_value));
     69       }
     70     }
     71   }
     72   return lut;
     73 }
     74 
     75 static constexpr GPU_SW_Backend::DitherLUT s_dither_lut = GPU_SW_Backend::ComputeDitherLUT();
     76 
     // Computes the final 15-bit colour for one pixel at VRAM position (x, y) and stores it with SetPixel().
     //
     // Template flags select the work done at compile time:
     //   texture_enable      - sample a texel (4-bit/8-bit palette or direct) after applying the texture window;
     //                         a texel whose bits are all zero aborts the pixel.
     //   raw_texture_enable  - use the texel as-is instead of modulating it by color_r/color_g/color_b.
     //   transparency_enable - blend with the existing VRAM pixel according to cmd->draw_mode.transparency_mode.
     //   dithering_enable    - index s_dither_lut by (y & 3, x & 3); otherwise the fixed entry [2][3] is used.
     //
     // The write is skipped when the existing pixel has any bit of cmd->params.GetMaskAND() set, and
     // cmd->params.GetMaskOR() is OR'd into the value that is written.
     77 template<bool texture_enable, bool raw_texture_enable, bool transparency_enable, bool dithering_enable>
     78 void ALWAYS_INLINE_RELEASE GPU_SW_Backend::ShadePixel(const GPUBackendDrawCommand* cmd, u32 x, u32 y, u8 color_r,
     79                                                       u8 color_g, u8 color_b, u8 texcoord_x, u8 texcoord_y)
     80 {
     81   VRAMPixel color;
     82   if constexpr (texture_enable)
     83   {
     84     // Apply texture window
     85     texcoord_x = (texcoord_x & cmd->window.and_x) | cmd->window.or_x;
     86     texcoord_y = (texcoord_y & cmd->window.and_y) | cmd->window.or_y;
     87 
     88     VRAMPixel texture_color;
     89     switch (cmd->draw_mode.texture_mode)
     90     {
                  // Four 4-bit palette indices are packed into each 16-bit VRAM word.
     91       case GPUTextureMode::Palette4Bit:
     92       {
     93         const u16 palette_value =
     94           GetPixel((cmd->draw_mode.GetTexturePageBaseX() + ZeroExtend32(texcoord_x / 4)) % VRAM_WIDTH,
     95                    (cmd->draw_mode.GetTexturePageBaseY() + ZeroExtend32(texcoord_y)) % VRAM_HEIGHT);
     96         const size_t palette_index = (palette_value >> ((texcoord_x % 4) * 4)) & 0x0Fu;
     97         texture_color.bits = g_gpu_clut[palette_index];
     98       }
     99       break;
    100 
                  // Two 8-bit palette indices are packed into each 16-bit VRAM word.
    101       case GPUTextureMode::Palette8Bit:
    102       {
    103         const u16 palette_value =
    104           GetPixel((cmd->draw_mode.GetTexturePageBaseX() + ZeroExtend32(texcoord_x / 2)) % VRAM_WIDTH,
    105                    (cmd->draw_mode.GetTexturePageBaseY() + ZeroExtend32(texcoord_y)) % VRAM_HEIGHT);
    106         const size_t palette_index = (palette_value >> ((texcoord_x % 2) * 8)) & 0xFFu;
    107         texture_color.bits = g_gpu_clut[palette_index];
    108       }
    109       break;
    110 
                  // Any other mode reads the 16-bit texel straight from VRAM, one word per texel.
    111       default:
    112       {
    113         texture_color.bits = GetPixel((cmd->draw_mode.GetTexturePageBaseX() + ZeroExtend32(texcoord_x)) % VRAM_WIDTH,
    114                                       (cmd->draw_mode.GetTexturePageBaseY() + ZeroExtend32(texcoord_y)) % VRAM_HEIGHT);
    115       }
    116       break;
    117     }
    118 
            // An all-zero texel is treated as fully transparent: nothing is drawn for this pixel.
    119     if (texture_color.bits == 0)
    120       return;
    121 
    122     if constexpr (raw_texture_enable)
    123     {
    124       color.bits = texture_color.bits;
    125     }
    126     else
    127     {
              // Without dithering a fixed LUT cell is used.
              // NOTE(review): presumably DITHER_MATRIX[2][3] carries no offset - confirm against its definition.
    128       const u32 dither_y = (dithering_enable) ? (y & 3u) : 2u;
    129       const u32 dither_x = (dithering_enable) ? (x & 3u) : 3u;
    130 
              // Modulate each texel channel by the vertex colour, reduce through the dither LUT and repack
              // as 5:5:5, carrying over the texel's bit 15.
    131       color.bits = (ZeroExtend16(s_dither_lut[dither_y][dither_x][(u16(texture_color.r) * u16(color_r)) >> 4]) << 0) |
    132                    (ZeroExtend16(s_dither_lut[dither_y][dither_x][(u16(texture_color.g) * u16(color_g)) >> 4]) << 5) |
    133                    (ZeroExtend16(s_dither_lut[dither_y][dither_x][(u16(texture_color.b) * u16(color_b)) >> 4]) << 10) |
    134                    (texture_color.bits & 0x8000u);
    135     }
    136   }
    137   else
    138   {
    139     const u32 dither_y = (dithering_enable) ? (y & 3u) : 2u;
    140     const u32 dither_x = (dithering_enable) ? (x & 3u) : 3u;
    141 
    142     // Non-textured transparent polygons don't set bit 15, but are treated as transparent.
    143     color.bits = (ZeroExtend16(s_dither_lut[dither_y][dither_x][color_r]) << 0) |
    144                  (ZeroExtend16(s_dither_lut[dither_y][dither_x][color_g]) << 5) |
    145                  (ZeroExtend16(s_dither_lut[dither_y][dither_x][color_b]) << 10) | (transparency_enable ? 0x8000u : 0);
    146   }
    147 
          // The existing VRAM pixel is needed both for blending and for the mask test below.
    148   const VRAMPixel bg_color{GetPixel(static_cast<u32>(x), static_cast<u32>(y))};
    149   if constexpr (transparency_enable)
    150   {
            // Textured pixels blend only when bit 15 of the colour is set; untextured pixels always blend.
    151     if (color.bits & 0x8000u || !texture_enable)
    152     {
    153       // Based on blargg's efficient 15bpp pixel math.
    154       u32 bg_bits = ZeroExtend32(bg_color.bits);
    155       u32 fg_bits = ZeroExtend32(color.bits);
    156       switch (cmd->draw_mode.transparency_mode)
    157       {
    158         case GPUTransparencyMode::HalfBackgroundPlusHalfForeground:
    159         {
    160           bg_bits |= 0x8000u;
    161           color.bits = Truncate16(((fg_bits + bg_bits) - ((fg_bits ^ bg_bits) & 0x0421u)) >> 1);
    162         }
    163         break;
    164 
    165         case GPUTransparencyMode::BackgroundPlusForeground:
    166         {
    167           bg_bits &= ~0x8000u;
    168 
    169           const u32 sum = fg_bits + bg_bits;
    170           const u32 carry = (sum - ((fg_bits ^ bg_bits) & 0x8421u)) & 0x8420u;
    171 
    172           color.bits = Truncate16((sum - carry) | (carry - (carry >> 5)));
    173         }
    174         break;
    175 
    176         case GPUTransparencyMode::BackgroundMinusForeground:
    177         {
    178           bg_bits |= 0x8000u;
    179           fg_bits &= ~0x8000u;
    180 
    181           const u32 diff = bg_bits - fg_bits + 0x108420u;
    182           const u32 borrow = (diff - ((bg_bits ^ fg_bits) & 0x108420u)) & 0x108420u;
    183 
    184           color.bits = Truncate16((diff - borrow) & (borrow - (borrow >> 5)));
    185         }
    186         break;
    187 
    188         case GPUTransparencyMode::BackgroundPlusQuarterForeground:
    189         {
    190           bg_bits &= ~0x8000u;
    191           fg_bits = ((fg_bits >> 2) & 0x1CE7u) | 0x8000u;
    192 
    193           const u32 sum = fg_bits + bg_bits;
    194           const u32 carry = (sum - ((fg_bits ^ bg_bits) & 0x8421u)) & 0x8420u;
    195 
    196           color.bits = Truncate16((sum - carry) | (carry - (carry >> 5)));
    197         }
    198         break;
    199 
    200         default:
    201           break;
    202       }
    203 
    204       // See above.
    205       if constexpr (!texture_enable)
    206         color.bits &= ~0x8000u;
    207     }
    208   }
    209 
          // Mask test: leave the destination untouched when it has any of the mask-AND bits set.
    210   const u16 mask_and = cmd->params.GetMaskAND();
    211   if ((bg_color.bits & mask_and) != 0)
    212     return;
    213 
    214   DebugAssert(static_cast<u32>(x) < VRAM_WIDTH && static_cast<u32>(y) < VRAM_HEIGHT);
    215   SetPixel(static_cast<u32>(x), static_cast<u32>(y), color.bits | cmd->params.GetMaskOR());
    216 }
    217 
    218 template<bool texture_enable, bool raw_texture_enable, bool transparency_enable>
    219 void GPU_SW_Backend::DrawRectangle(const GPUBackendDrawRectangleCommand* cmd)
    220 {
    221   const s32 origin_x = cmd->x;
    222   const s32 origin_y = cmd->y;
    223   const auto [r, g, b] = UnpackColorRGB24(cmd->color);
    224   const auto [origin_texcoord_x, origin_texcoord_y] = UnpackTexcoord(cmd->texcoord);
    225 
    226   for (u32 offset_y = 0; offset_y < cmd->height; offset_y++)
    227   {
    228     const s32 y = origin_y + static_cast<s32>(offset_y);
    229     if (y < static_cast<s32>(m_drawing_area.top) || y > static_cast<s32>(m_drawing_area.bottom) ||
    230         (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast<u32>(y)) & 1u)))
    231     {
    232       continue;
    233     }
    234 
    235     const u32 draw_y = static_cast<u32>(y) & VRAM_HEIGHT_MASK;
    236     const u8 texcoord_y = Truncate8(ZeroExtend32(origin_texcoord_y) + offset_y);
    237 
    238     for (u32 offset_x = 0; offset_x < cmd->width; offset_x++)
    239     {
    240       const s32 x = origin_x + static_cast<s32>(offset_x);
    241       if (x < static_cast<s32>(m_drawing_area.left) || x > static_cast<s32>(m_drawing_area.right))
    242         continue;
    243 
    244       const u8 texcoord_x = Truncate8(ZeroExtend32(origin_texcoord_x) + offset_x);
    245 
    246       ShadePixel<texture_enable, raw_texture_enable, transparency_enable, false>(cmd, static_cast<u32>(x), draw_y, r, g,
    247                                                                                  b, texcoord_x, texcoord_y);
    248     }
    249   }
    250 }
    251 
    252 //////////////////////////////////////////////////////////////////////////
    253 // Polygon and line rasterization ported from Mednafen
    254 //////////////////////////////////////////////////////////////////////////
    255 
    // Fixed-point layout of the polygon interpolants (r/g/b/u/v): values are built with COORD_MF_INT()
    // and then shifted left by COORD_POST_PADDING, so DrawSpan() recovers the integer part by shifting
    // right by COORD_FBS + COORD_POST_PADDING.
    256 #define COORD_FBS 12
    257 #define COORD_MF_INT(n) ((n) << COORD_FBS)
    258 #define COORD_POST_PADDING 12
    259 
    260 static ALWAYS_INLINE_RELEASE s64 MakePolyXFP(s32 x)
    261 {
    262   return ((u64)x << 32) + ((1ULL << 32) - (1 << 11));
    263 }
    264 
    265 static ALWAYS_INLINE_RELEASE s64 MakePolyXFPStep(s32 dx, s32 dy)
    266 {
    267   s64 ret;
    268   s64 dx_ex = (u64)dx << 32;
    269 
    270   if (dx_ex < 0)
    271     dx_ex -= dy - 1;
    272 
    273   if (dx_ex > 0)
    274     dx_ex += dy - 1;
    275 
    276   ret = dx_ex / dy;
    277 
    278   return (ret);
    279 }
    280 
    281 static ALWAYS_INLINE_RELEASE s32 GetPolyXFP_Int(s64 xfp)
    282 {
    283   return (xfp >> 32);
    284 }
    285 
    // Computes the per-pixel gradients of the interpolated attributes across triangle A-B-C and stores
    // them in idl: colour gradients when shading_enable is set, texture-coordinate gradients when
    // texture_enable is set. Each gradient is scaled by (1 << COORD_FBS) before the division and then
    // shifted left by COORD_POST_PADDING.
    //
    // Returns false, leaving idl untouched, when the triangle is degenerate (the cross product of
    // edges A->B and B->C is zero), which also prevents a division by zero.
    286 template<bool shading_enable, bool texture_enable>
    287 bool ALWAYS_INLINE_RELEASE GPU_SW_Backend::CalcIDeltas(i_deltas& idl, const GPUBackendDrawPolygonCommand::Vertex* A,
    288                                                        const GPUBackendDrawPolygonCommand::Vertex* B,
    289                                                        const GPUBackendDrawPolygonCommand::Vertex* C)
    290 {
        // CALCIS(p, q): cross product of (B - A) and (C - B) taken over vertex members p and q.
    291 #define CALCIS(x, y) (((B->x - A->x) * (C->y - B->y)) - ((C->x - B->x) * (B->y - A->y)))
    292 
    293   s32 denom = CALCIS(x, y);
    294 
    295   if (!denom)
    296     return false;
    297 
    298   if constexpr (shading_enable)
    299   {
            // The u32 cast makes the left shift well-defined for negative quotients.
    300     idl.dr_dx = (u32)(CALCIS(r, y) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
    301     idl.dr_dy = (u32)(CALCIS(x, r) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
    302 
    303     idl.dg_dx = (u32)(CALCIS(g, y) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
    304     idl.dg_dy = (u32)(CALCIS(x, g) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
    305 
    306     idl.db_dx = (u32)(CALCIS(b, y) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
    307     idl.db_dy = (u32)(CALCIS(x, b) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
    308   }
    309 
    310   if constexpr (texture_enable)
    311   {
    312     idl.du_dx = (u32)(CALCIS(u, y) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
    313     idl.du_dy = (u32)(CALCIS(x, u) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
    314 
    315     idl.dv_dx = (u32)(CALCIS(v, y) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
    316     idl.dv_dy = (u32)(CALCIS(x, v) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
    317   }
    318 
    319   return true;
    320 
    321 #undef CALCIS
    322 }
    323 
    324 template<bool shading_enable, bool texture_enable>
    325 void ALWAYS_INLINE_RELEASE GPU_SW_Backend::AddIDeltas_DX(i_group& ig, const i_deltas& idl, u32 count /*= 1*/)
    326 {
    327   if constexpr (shading_enable)
    328   {
    329     ig.r += idl.dr_dx * count;
    330     ig.g += idl.dg_dx * count;
    331     ig.b += idl.db_dx * count;
    332   }
    333 
    334   if constexpr (texture_enable)
    335   {
    336     ig.u += idl.du_dx * count;
    337     ig.v += idl.dv_dx * count;
    338   }
    339 }
    340 
    341 template<bool shading_enable, bool texture_enable>
    342 void ALWAYS_INLINE_RELEASE GPU_SW_Backend::AddIDeltas_DY(i_group& ig, const i_deltas& idl, u32 count /*= 1*/)
    343 {
    344   if constexpr (shading_enable)
    345   {
    346     ig.r += idl.dr_dy * count;
    347     ig.g += idl.dg_dy * count;
    348     ig.b += idl.db_dy * count;
    349   }
    350 
    351   if constexpr (texture_enable)
    352   {
    353     ig.u += idl.du_dy * count;
    354     ig.v += idl.dv_dy * count;
    355   }
    356 }
    357 
    358 template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable,
    359          bool dithering_enable>
    360 void GPU_SW_Backend::DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start, s32 x_bound, i_group ig,
    361                               const i_deltas& idl)
    362 {
    363   if (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast<u32>(y)) & 1u))
    364     return;
    365 
    366   s32 x_ig_adjust = x_start;
    367   s32 w = x_bound - x_start;
    368   s32 x = TruncateGPUVertexPosition(x_start);
    369 
    370   if (x < static_cast<s32>(m_drawing_area.left))
    371   {
    372     s32 delta = static_cast<s32>(m_drawing_area.left) - x;
    373     x_ig_adjust += delta;
    374     x += delta;
    375     w -= delta;
    376   }
    377 
    378   if ((x + w) > (static_cast<s32>(m_drawing_area.right) + 1))
    379     w = static_cast<s32>(m_drawing_area.right) + 1 - x;
    380 
    381   if (w <= 0)
    382     return;
    383 
    384   AddIDeltas_DX<shading_enable, texture_enable>(ig, idl, x_ig_adjust);
    385   AddIDeltas_DY<shading_enable, texture_enable>(ig, idl, y);
    386 
    387   do
    388   {
    389     const u32 r = ig.r >> (COORD_FBS + COORD_POST_PADDING);
    390     const u32 g = ig.g >> (COORD_FBS + COORD_POST_PADDING);
    391     const u32 b = ig.b >> (COORD_FBS + COORD_POST_PADDING);
    392     const u32 u = ig.u >> (COORD_FBS + COORD_POST_PADDING);
    393     const u32 v = ig.v >> (COORD_FBS + COORD_POST_PADDING);
    394 
    395     ShadePixel<texture_enable, raw_texture_enable, transparency_enable, dithering_enable>(
    396       cmd, static_cast<u32>(x), static_cast<u32>(y), Truncate8(r), Truncate8(g), Truncate8(b), Truncate8(u),
    397       Truncate8(v));
    398 
    399     x++;
    400     AddIDeltas_DX<shading_enable, texture_enable>(ig, idl);
    401   } while (--w > 0);
    402 }
    403 
    // Rasterizes the triangle (v0, v1, v2).
    //
    // Steps: pick the "core" vertex (the one with the smallest x), sort the vertices by ascending y,
    // reject zero-height and oversized triangles, compute the fixed-point x steps of the three edges,
    // compute the attribute gradients, then walk the triangle as two halves and emit one DrawSpan()
    // per scanline that lies inside the drawing area.
    404 template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable,
    405          bool dithering_enable>
    406 void GPU_SW_Backend::DrawTriangle(const GPUBackendDrawPolygonCommand* cmd,
    407                                   const GPUBackendDrawPolygonCommand::Vertex* v0,
    408                                   const GPUBackendDrawPolygonCommand::Vertex* v1,
    409                                   const GPUBackendDrawPolygonCommand::Vertex* v2)
    410 {
    411   u32 core_vertex;
    412   {
            // cvtemp is a one-hot mask (bit n = vertex n) marking the vertex with the smallest x.
    413     u32 cvtemp = 0;
    414 
    415     if (v1->x <= v0->x)
    416     {
    417       if (v2->x <= v1->x)
    418         cvtemp = (1 << 2);
    419       else
    420         cvtemp = (1 << 1);
    421     }
    422     else if (v2->x < v0->x)
    423       cvtemp = (1 << 2);
    424     else
    425       cvtemp = (1 << 0);
    426 
            // Sort by ascending y. Each pointer swap also swaps the matching mask bits, so the mask keeps
            // pointing at the same vertex.
    427     if (v2->y < v1->y)
    428     {
    429       std::swap(v2, v1);
    430       cvtemp = ((cvtemp >> 1) & 0x2) | ((cvtemp << 1) & 0x4) | (cvtemp & 0x1);
    431     }
    432 
    433     if (v1->y < v0->y)
    434     {
    435       std::swap(v1, v0);
    436       cvtemp = ((cvtemp >> 1) & 0x1) | ((cvtemp << 1) & 0x2) | (cvtemp & 0x4);
    437     }
    438 
    439     if (v2->y < v1->y)
    440     {
    441       std::swap(v2, v1);
    442       cvtemp = ((cvtemp >> 1) & 0x2) | ((cvtemp << 1) & 0x4) | (cvtemp & 0x1);
    443     }
    444 
            // Convert the one-hot mask (1, 2, 4) into an index (0, 1, 2) into the sorted vertices.
    445     core_vertex = cvtemp >> 1;
    446   }
    447 
          // After sorting, v0 is the top and v2 the bottom vertex: equal y means zero height.
    448   if (v0->y == v2->y)
    449     return;
    450 
          // Drop triangles whose extent reaches the maximum primitive size.
    451   if (static_cast<u32>(std::abs(v2->x - v0->x)) >= MAX_PRIMITIVE_WIDTH ||
    452       static_cast<u32>(std::abs(v2->x - v1->x)) >= MAX_PRIMITIVE_WIDTH ||
    453       static_cast<u32>(std::abs(v1->x - v0->x)) >= MAX_PRIMITIVE_WIDTH ||
    454       static_cast<u32>(v2->y - v0->y) >= MAX_PRIMITIVE_HEIGHT)
    455   {
    456     return;
    457   }
    458 
          // base_*: the long edge v0 -> v2. bound_coord_us / bound_coord_ls: x steps of the short edges
          // v0 -> v1 and v1 -> v2 (0 when the edge is horizontal).
    459   s64 base_coord = MakePolyXFP(v0->x);
    460   s64 base_step = MakePolyXFPStep((v2->x - v0->x), (v2->y - v0->y));
    461   s64 bound_coord_us;
    462   s64 bound_coord_ls;
    463   bool right_facing;
    464 
    465   if (v1->y == v0->y)
    466   {
    467     bound_coord_us = 0;
    468     right_facing = (bool)(v1->x > v0->x);
    469   }
    470   else
    471   {
    472     bound_coord_us = MakePolyXFPStep((v1->x - v0->x), (v1->y - v0->y));
    473     right_facing = (bool)(bound_coord_us > base_step);
    474   }
    475 
    476   if (v2->y == v1->y)
    477     bound_coord_ls = 0;
    478   else
    479     bound_coord_ls = MakePolyXFPStep((v2->x - v1->x), (v2->y - v1->y));
    480 
    481   i_deltas idl;
    482   if (!CalcIDeltas<shading_enable, texture_enable>(idl, v0, v1, v2))
    483     return;
    484 
    485   const GPUBackendDrawPolygonCommand::Vertex* vertices[3] = {v0, v1, v2};
    486 
          // Seed the interpolants with the core vertex's attributes plus half a unit, then rebase them to
          // the origin so DrawSpan() can add absolute x/y multiples of the gradients.
    487   i_group ig;
    488   if constexpr (texture_enable)
    489   {
    490     ig.u = (COORD_MF_INT(vertices[core_vertex]->u) + (1 << (COORD_FBS - 1))) << COORD_POST_PADDING;
    491     ig.v = (COORD_MF_INT(vertices[core_vertex]->v) + (1 << (COORD_FBS - 1))) << COORD_POST_PADDING;
    492   }
    493 
    494   ig.r = (COORD_MF_INT(vertices[core_vertex]->r) + (1 << (COORD_FBS - 1))) << COORD_POST_PADDING;
    495   ig.g = (COORD_MF_INT(vertices[core_vertex]->g) + (1 << (COORD_FBS - 1))) << COORD_POST_PADDING;
    496   ig.b = (COORD_MF_INT(vertices[core_vertex]->b) + (1 << (COORD_FBS - 1))) << COORD_POST_PADDING;
    497 
    498   AddIDeltas_DX<shading_enable, texture_enable>(ig, idl, -vertices[core_vertex]->x);
    499   AddIDeltas_DY<shading_enable, texture_enable>(ig, idl, -vertices[core_vertex]->y);
    500 
          // The triangle is walked as two halves. x_coord/x_step index 0 is the left edge and index 1 the
          // right edge (see the loop below); dec_mode selects walking with decreasing y.
    501   struct TriangleHalf
    502   {
    503     u64 x_coord[2];
    504     u64 x_step[2];
    505 
    506     s32 y_coord;
    507     s32 y_bound;
    508 
    509     bool dec_mode;
    510   } tripart[2];
    511 
          // vo / vp depend on the core vertex and select each half's start vertex and walking direction.
    512   u32 vo = 0;
    513   u32 vp = 0;
    514   if (core_vertex != 0)
    515     vo = 1;
    516   if (core_vertex == 2)
    517     vp = 3;
    518 
          // Half bounded by the v0 - v1 short edge.
    519   {
    520     TriangleHalf* tp = &tripart[vo];
    521     tp->y_coord = vertices[0 ^ vo]->y;
    522     tp->y_bound = vertices[1 ^ vo]->y;
    523     tp->x_coord[right_facing] = MakePolyXFP(vertices[0 ^ vo]->x);
    524     tp->x_step[right_facing] = bound_coord_us;
    525     tp->x_coord[!right_facing] = base_coord + ((vertices[vo]->y - vertices[0]->y) * base_step);
    526     tp->x_step[!right_facing] = base_step;
    527     tp->dec_mode = vo;
    528   }
    529 
          // Half bounded by the v1 - v2 short edge.
    530   {
    531     TriangleHalf* tp = &tripart[vo ^ 1];
    532     tp->y_coord = vertices[1 ^ vp]->y;
    533     tp->y_bound = vertices[2 ^ vp]->y;
    534     tp->x_coord[right_facing] = MakePolyXFP(vertices[1 ^ vp]->x);
    535     tp->x_step[right_facing] = bound_coord_ls;
    536     tp->x_coord[!right_facing] =
    537       base_coord + ((vertices[1 ^ vp]->y - vertices[0]->y) *
    538                     base_step); // base_coord + ((vertices[1].y - vertices[0].y) * base_step);
    539     tp->x_step[!right_facing] = base_step;
    540     tp->dec_mode = vp;
    541   }
    542 
    543   for (u32 i = 0; i < 2; i++)
    544   {
    545     s32 yi = tripart[i].y_coord;
    546     s32 yb = tripart[i].y_bound;
    547 
    548     u64 lc = tripart[i].x_coord[0];
    549     u64 ls = tripart[i].x_step[0];
    550 
    551     u64 rc = tripart[i].x_coord[1];
    552     u64 rs = tripart[i].x_step[1];
    553 
    554     if (tripart[i].dec_mode)
    555     {
              // Walk upwards: step first, then draw. Once above the drawing area nothing else can be visible.
    556       while (yi > yb)
    557       {
    558         yi--;
    559         lc -= ls;
    560         rc -= rs;
    561 
    562         s32 y = TruncateGPUVertexPosition(yi);
    563 
    564         if (y < static_cast<s32>(m_drawing_area.top))
    565           break;
    566 
    567         if (y > static_cast<s32>(m_drawing_area.bottom))
    568           continue;
    569 
    570         DrawSpan<shading_enable, texture_enable, raw_texture_enable, transparency_enable, dithering_enable>(
    571           cmd, y & VRAM_HEIGHT_MASK, GetPolyXFP_Int(lc), GetPolyXFP_Int(rc), ig, idl);
    572       }
    573     }
    574     else
    575     {
              // Walk downwards: draw first, then step. Once below the drawing area nothing else can be visible.
    576       while (yi < yb)
    577       {
    578         s32 y = TruncateGPUVertexPosition(yi);
    579 
    580         if (y > static_cast<s32>(m_drawing_area.bottom))
    581           break;
    582 
    583         if (y >= static_cast<s32>(m_drawing_area.top))
    584         {
    585           DrawSpan<shading_enable, texture_enable, raw_texture_enable, transparency_enable, dithering_enable>(
    586             cmd, y & VRAM_HEIGHT_MASK, GetPolyXFP_Int(lc), GetPolyXFP_Int(rc), ig, idl);
    587         }
    588 
    589         yi++;
    590         lc += ls;
    591         rc += rs;
    592       }
    593     }
    594   }
    595 }
    596 
    // Number of fractional bits in the fixed-point x/y line coordinates (see LineDivide() and the
    // cur_point setup in the DrawLine<> template).
    597 enum
    598 {
    599   Line_XY_FractBits = 32
    600 };
    // Number of fractional bits in the fixed-point r/g/b values interpolated along a line.
    601 enum
    602 {
    603   Line_RGB_FractBits = 12
    604 };
    605 
    // Current position and colour while walking a line, in fixed point: x/y carry Line_XY_FractBits
    // fractional bits and r/g/b carry Line_RGB_FractBits fractional bits.
    606 struct line_fxp_coord
    607 {
    608   u64 x, y;
    609   u32 r, g, b;
    610 };
    611 
    // Per-step increments added to a line_fxp_coord for every pixel drawn along a line; each field
    // uses the same fixed-point format as the corresponding line_fxp_coord field.
    612 struct line_fxp_step
    613 {
    614   s64 dx_dk, dy_dk;
    615   s32 dr_dk, dg_dk, db_dk;
    616 };
    617 
    618 static ALWAYS_INLINE_RELEASE s64 LineDivide(s64 delta, s32 dk)
    619 {
    620   delta = (u64)delta << Line_XY_FractBits;
    621 
    622   if (delta < 0)
    623     delta -= dk - 1;
    624   if (delta > 0)
    625     delta += dk - 1;
    626 
    627   return (delta / dk);
    628 }
    629 
    630 template<bool shading_enable, bool transparency_enable, bool dithering_enable>
    631 void GPU_SW_Backend::DrawLine(const GPUBackendDrawLineCommand* cmd, const GPUBackendDrawLineCommand::Vertex* p0,
    632                               const GPUBackendDrawLineCommand::Vertex* p1)
    633 {
    634   const s32 i_dx = std::abs(p1->x - p0->x);
    635   const s32 i_dy = std::abs(p1->y - p0->y);
    636   const s32 k = (i_dx > i_dy) ? i_dx : i_dy;
    637   if (i_dx >= MAX_PRIMITIVE_WIDTH || i_dy >= MAX_PRIMITIVE_HEIGHT)
    638     return;
    639 
    640   if (p0->x >= p1->x && k > 0)
    641     std::swap(p0, p1);
    642 
    643   line_fxp_step step;
    644   if (k == 0)
    645   {
    646     step.dx_dk = 0;
    647     step.dy_dk = 0;
    648 
    649     if constexpr (shading_enable)
    650     {
    651       step.dr_dk = 0;
    652       step.dg_dk = 0;
    653       step.db_dk = 0;
    654     }
    655   }
    656   else
    657   {
    658     step.dx_dk = LineDivide(p1->x - p0->x, k);
    659     step.dy_dk = LineDivide(p1->y - p0->y, k);
    660 
    661     if constexpr (shading_enable)
    662     {
    663       step.dr_dk = (s32)((u32)(p1->r - p0->r) << Line_RGB_FractBits) / k;
    664       step.dg_dk = (s32)((u32)(p1->g - p0->g) << Line_RGB_FractBits) / k;
    665       step.db_dk = (s32)((u32)(p1->b - p0->b) << Line_RGB_FractBits) / k;
    666     }
    667   }
    668 
    669   line_fxp_coord cur_point;
    670   cur_point.x = ((u64)p0->x << Line_XY_FractBits) | (1ULL << (Line_XY_FractBits - 1));
    671   cur_point.y = ((u64)p0->y << Line_XY_FractBits) | (1ULL << (Line_XY_FractBits - 1));
    672 
    673   cur_point.x -= 1024;
    674 
    675   if (step.dy_dk < 0)
    676     cur_point.y -= 1024;
    677 
    678   if constexpr (shading_enable)
    679   {
    680     cur_point.r = (p0->r << Line_RGB_FractBits) | (1 << (Line_RGB_FractBits - 1));
    681     cur_point.g = (p0->g << Line_RGB_FractBits) | (1 << (Line_RGB_FractBits - 1));
    682     cur_point.b = (p0->b << Line_RGB_FractBits) | (1 << (Line_RGB_FractBits - 1));
    683   }
    684 
    685   for (s32 i = 0; i <= k; i++)
    686   {
    687     // Sign extension is not necessary here for x and y, due to the maximum values that ClipX1 and ClipY1 can contain.
    688     const s32 x = (cur_point.x >> Line_XY_FractBits) & 2047;
    689     const s32 y = (cur_point.y >> Line_XY_FractBits) & 2047;
    690 
    691     if ((!cmd->params.interlaced_rendering || cmd->params.active_line_lsb != (Truncate8(static_cast<u32>(y)) & 1u)) &&
    692         x >= static_cast<s32>(m_drawing_area.left) && x <= static_cast<s32>(m_drawing_area.right) &&
    693         y >= static_cast<s32>(m_drawing_area.top) && y <= static_cast<s32>(m_drawing_area.bottom))
    694     {
    695       const u8 r = shading_enable ? static_cast<u8>(cur_point.r >> Line_RGB_FractBits) : p0->r;
    696       const u8 g = shading_enable ? static_cast<u8>(cur_point.g >> Line_RGB_FractBits) : p0->g;
    697       const u8 b = shading_enable ? static_cast<u8>(cur_point.b >> Line_RGB_FractBits) : p0->b;
    698 
    699       ShadePixel<false, false, transparency_enable, dithering_enable>(
    700         cmd, static_cast<u32>(x), static_cast<u32>(y) & VRAM_HEIGHT_MASK, r, g, b, 0, 0);
    701     }
    702 
    703     cur_point.x += step.dx_dk;
    704     cur_point.y += step.dy_dk;
    705 
    706     if constexpr (shading_enable)
    707     {
    708       cur_point.r += step.dr_dk;
    709       cur_point.g += step.dg_dk;
    710       cur_point.b += step.db_dk;
    711     }
    712   }
    713 }
    714 
    715 void GPU_SW_Backend::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color, GPUBackendCommandParameters params)
    716 {
    717   const u16 color16 = VRAMRGBA8888ToRGBA5551(color);
    718   const GSVector4i fill = GSVector4i(color16, color16, color16, color16, color16, color16, color16, color16);
    719   constexpr u32 vector_width = 8;
    720   const u32 aligned_width = Common::AlignDownPow2(width, vector_width);
    721 
    722   if ((x + width) <= VRAM_WIDTH && !params.interlaced_rendering)
    723   {
    724     for (u32 yoffs = 0; yoffs < height; yoffs++)
    725     {
    726       const u32 row = (y + yoffs) % VRAM_HEIGHT;
    727 
    728       u16* row_ptr = &g_vram[row * VRAM_WIDTH + x];
    729       u32 xoffs = 0;
    730       for (; xoffs < aligned_width; xoffs += vector_width, row_ptr += vector_width)
    731         GSVector4i::store<false>(row_ptr, fill);
    732       for (; xoffs < width; xoffs++)
    733         *(row_ptr++) = color16;
    734     }
    735   }
    736   else if (params.interlaced_rendering)
    737   {
    738     // Hardware tests show that fills seem to break on the first two lines when the offset matches the displayed field.
    739     const u32 active_field = params.active_line_lsb;
    740 
    741     if ((x + width) <= VRAM_WIDTH)
    742     {
    743       for (u32 yoffs = 0; yoffs < height; yoffs++)
    744       {
    745         const u32 row = (y + yoffs) % VRAM_HEIGHT;
    746         if ((row & u32(1)) == active_field)
    747           continue;
    748 
    749         u16* row_ptr = &g_vram[row * VRAM_WIDTH + x];
    750         u32 xoffs = 0;
    751         for (; xoffs < aligned_width; xoffs += vector_width, row_ptr += vector_width)
    752           GSVector4i::store<false>(row_ptr, fill);
    753         for (; xoffs < width; xoffs++)
    754           *(row_ptr++) = color16;
    755       }
    756     }
    757     else
    758     {
    759       for (u32 yoffs = 0; yoffs < height; yoffs++)
    760       {
    761         const u32 row = (y + yoffs) % VRAM_HEIGHT;
    762         if ((row & u32(1)) == active_field)
    763           continue;
    764 
    765         u16* row_ptr = &g_vram[row * VRAM_WIDTH];
    766         for (u32 xoffs = 0; xoffs < width; xoffs++)
    767         {
    768           const u32 col = (x + xoffs) % VRAM_WIDTH;
    769           row_ptr[col] = color16;
    770         }
    771       }
    772     }
    773   }
    774   else
    775   {
    776     for (u32 yoffs = 0; yoffs < height; yoffs++)
    777     {
    778       const u32 row = (y + yoffs) % VRAM_HEIGHT;
    779       u16* row_ptr = &g_vram[row * VRAM_WIDTH];
    780       for (u32 xoffs = 0; xoffs < width; xoffs++)
    781       {
    782         const u32 col = (x + xoffs) % VRAM_WIDTH;
    783         row_ptr[col] = color16;
    784       }
    785     }
    786   }
    787 }
    788 
    789 void GPU_SW_Backend::UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data,
    790                                 GPUBackendCommandParameters params)
    791 {
    792   // Fast path when the copy is not oversized.
    793   if ((x + width) <= VRAM_WIDTH && (y + height) <= VRAM_HEIGHT && !params.IsMaskingEnabled())
    794   {
    795     const u16* src_ptr = static_cast<const u16*>(data);
    796     u16* dst_ptr = &g_vram[y * VRAM_WIDTH + x];
    797     for (u32 yoffs = 0; yoffs < height; yoffs++)
    798     {
    799       std::copy_n(src_ptr, width, dst_ptr);
    800       src_ptr += width;
    801       dst_ptr += VRAM_WIDTH;
    802     }
    803   }
    804   else
    805   {
    806     // Slow path when we need to handle wrap-around.
    807     const u16* src_ptr = static_cast<const u16*>(data);
    808     const u16 mask_and = params.GetMaskAND();
    809     const u16 mask_or = params.GetMaskOR();
    810 
    811     for (u32 row = 0; row < height;)
    812     {
    813       u16* dst_row_ptr = &g_vram[((y + row++) % VRAM_HEIGHT) * VRAM_WIDTH];
    814       for (u32 col = 0; col < width;)
    815       {
    816         // TODO: Handle unaligned reads...
    817         u16* pixel_ptr = &dst_row_ptr[(x + col++) % VRAM_WIDTH];
    818         if (((*pixel_ptr) & mask_and) == 0)
    819           *pixel_ptr = *(src_ptr++) | mask_or;
    820       }
    821     }
    822   }
    823 }
    824 
    825 void GPU_SW_Backend::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height,
    826                               GPUBackendCommandParameters params)
    827 {
    828   // Break up oversized copies. This behavior has not been verified on console.
    829   if ((src_x + width) > VRAM_WIDTH || (dst_x + width) > VRAM_WIDTH)
    830   {
    831     u32 remaining_rows = height;
    832     u32 current_src_y = src_y;
    833     u32 current_dst_y = dst_y;
    834     while (remaining_rows > 0)
    835     {
    836       const u32 rows_to_copy =
    837         std::min<u32>(remaining_rows, std::min<u32>(VRAM_HEIGHT - current_src_y, VRAM_HEIGHT - current_dst_y));
    838 
    839       u32 remaining_columns = width;
    840       u32 current_src_x = src_x;
    841       u32 current_dst_x = dst_x;
    842       while (remaining_columns > 0)
    843       {
    844         const u32 columns_to_copy =
    845           std::min<u32>(remaining_columns, std::min<u32>(VRAM_WIDTH - current_src_x, VRAM_WIDTH - current_dst_x));
    846         CopyVRAM(current_src_x, current_src_y, current_dst_x, current_dst_y, columns_to_copy, rows_to_copy, params);
    847         current_src_x = (current_src_x + columns_to_copy) % VRAM_WIDTH;
    848         current_dst_x = (current_dst_x + columns_to_copy) % VRAM_WIDTH;
    849         remaining_columns -= columns_to_copy;
    850       }
    851 
    852       current_src_y = (current_src_y + rows_to_copy) % VRAM_HEIGHT;
    853       current_dst_y = (current_dst_y + rows_to_copy) % VRAM_HEIGHT;
    854       remaining_rows -= rows_to_copy;
    855     }
    856 
    857     return;
    858   }
    859 
    860   // This doesn't have a fast path, but do we really need one? It's not common.
    861   const u16 mask_and = params.GetMaskAND();
    862   const u16 mask_or = params.GetMaskOR();
    863 
    864   // Copy in reverse when src_x < dst_x, this is verified on console.
    865   if (src_x < dst_x || ((src_x + width - 1) % VRAM_WIDTH) < ((dst_x + width - 1) % VRAM_WIDTH))
    866   {
    867     for (u32 row = 0; row < height; row++)
    868     {
    869       const u16* src_row_ptr = &g_vram[((src_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
    870       u16* dst_row_ptr = &g_vram[((dst_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
    871 
    872       for (s32 col = static_cast<s32>(width - 1); col >= 0; col--)
    873       {
    874         const u16 src_pixel = src_row_ptr[(src_x + static_cast<u32>(col)) % VRAM_WIDTH];
    875         u16* dst_pixel_ptr = &dst_row_ptr[(dst_x + static_cast<u32>(col)) % VRAM_WIDTH];
    876         if ((*dst_pixel_ptr & mask_and) == 0)
    877           *dst_pixel_ptr = src_pixel | mask_or;
    878       }
    879     }
    880   }
    881   else
    882   {
    883     for (u32 row = 0; row < height; row++)
    884     {
    885       const u16* src_row_ptr = &g_vram[((src_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
    886       u16* dst_row_ptr = &g_vram[((dst_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
    887 
    888       for (u32 col = 0; col < width; col++)
    889       {
    890         const u16 src_pixel = src_row_ptr[(src_x + col) % VRAM_WIDTH];
    891         u16* dst_pixel_ptr = &dst_row_ptr[(dst_x + col) % VRAM_WIDTH];
    892         if ((*dst_pixel_ptr & mask_and) == 0)
    893           *dst_pixel_ptr = src_pixel | mask_or;
    894       }
    895     }
    896   }
    897 }
    898 
    899 void GPU_SW_Backend::FlushRender()
    900 {
    901 }
    902 
    903 void GPU_SW_Backend::DrawingAreaChanged()
    904 {
    905 }
    906 
    907 void GPU_SW_Backend::UpdateCLUT(GPUTexturePaletteReg reg, bool clut_is_8bit)
    908 {
    909   GPU::ReadCLUT(g_gpu_clut, reg, clut_is_8bit);
    910 }
    911 
    912 GPU_SW_Backend::DrawLineFunction GPU_SW_Backend::GetDrawLineFunction(bool shading_enable, bool transparency_enable,
    913                                                                      bool dithering_enable)
    914 {
    915   static constexpr DrawLineFunction funcs[2][2][2] = {
    916     {{&GPU_SW_Backend::DrawLine<false, false, false>, &GPU_SW_Backend::DrawLine<false, false, true>},
    917      {&GPU_SW_Backend::DrawLine<false, true, false>, &GPU_SW_Backend::DrawLine<false, true, true>}},
    918     {{&GPU_SW_Backend::DrawLine<true, false, false>, &GPU_SW_Backend::DrawLine<true, false, true>},
    919      {&GPU_SW_Backend::DrawLine<true, true, false>, &GPU_SW_Backend::DrawLine<true, true, true>}}};
    920 
    921   return funcs[u8(shading_enable)][u8(transparency_enable)][u8(dithering_enable)];
    922 }
    923 
    924 GPU_SW_Backend::DrawRectangleFunction
    925 GPU_SW_Backend::GetDrawRectangleFunction(bool texture_enable, bool raw_texture_enable, bool transparency_enable)
    926 {
    927   static constexpr DrawRectangleFunction funcs[2][2][2] = {
    928     {{&GPU_SW_Backend::DrawRectangle<false, false, false>, &GPU_SW_Backend::DrawRectangle<false, false, true>},
    929      {&GPU_SW_Backend::DrawRectangle<false, false, false>, &GPU_SW_Backend::DrawRectangle<false, false, true>}},
    930     {{&GPU_SW_Backend::DrawRectangle<true, false, false>, &GPU_SW_Backend::DrawRectangle<true, false, true>},
    931      {&GPU_SW_Backend::DrawRectangle<true, true, false>, &GPU_SW_Backend::DrawRectangle<true, true, true>}}};
    932 
    933   return funcs[u8(texture_enable)][u8(raw_texture_enable)][u8(transparency_enable)];
    934 }
    935 
    936 GPU_SW_Backend::DrawTriangleFunction GPU_SW_Backend::GetDrawTriangleFunction(bool shading_enable, bool texture_enable,
    937                                                                              bool raw_texture_enable,
    938                                                                              bool transparency_enable,
    939                                                                              bool dithering_enable)
    940 {
    941   static constexpr DrawTriangleFunction funcs[2][2][2][2][2] = {
    942     {{{{&GPU_SW_Backend::DrawTriangle<false, false, false, false, false>,
    943         &GPU_SW_Backend::DrawTriangle<false, false, false, false, true>},
    944        {&GPU_SW_Backend::DrawTriangle<false, false, false, true, false>,
    945         &GPU_SW_Backend::DrawTriangle<false, false, false, true, true>}},
    946       {{&GPU_SW_Backend::DrawTriangle<false, false, false, false, false>,
    947         &GPU_SW_Backend::DrawTriangle<false, false, false, false, false>},
    948        {&GPU_SW_Backend::DrawTriangle<false, false, false, true, false>,
    949         &GPU_SW_Backend::DrawTriangle<false, false, false, true, false>}}},
    950      {{{&GPU_SW_Backend::DrawTriangle<false, true, false, false, false>,
    951         &GPU_SW_Backend::DrawTriangle<false, true, false, false, true>},
    952        {&GPU_SW_Backend::DrawTriangle<false, true, false, true, false>,
    953         &GPU_SW_Backend::DrawTriangle<false, true, false, true, true>}},
    954       {{&GPU_SW_Backend::DrawTriangle<false, true, true, false, false>,
    955         &GPU_SW_Backend::DrawTriangle<false, true, true, false, false>},
    956        {&GPU_SW_Backend::DrawTriangle<false, true, true, true, false>,
    957         &GPU_SW_Backend::DrawTriangle<false, true, true, true, false>}}}},
    958     {{{{&GPU_SW_Backend::DrawTriangle<true, false, false, false, false>,
    959         &GPU_SW_Backend::DrawTriangle<true, false, false, false, true>},
    960        {&GPU_SW_Backend::DrawTriangle<true, false, false, true, false>,
    961         &GPU_SW_Backend::DrawTriangle<true, false, false, true, true>}},
    962       {{&GPU_SW_Backend::DrawTriangle<true, false, false, false, false>,
    963         &GPU_SW_Backend::DrawTriangle<true, false, false, false, false>},
    964        {&GPU_SW_Backend::DrawTriangle<true, false, false, true, false>,
    965         &GPU_SW_Backend::DrawTriangle<true, false, false, true, false>}}},
    966      {{{&GPU_SW_Backend::DrawTriangle<true, true, false, false, false>,
    967         &GPU_SW_Backend::DrawTriangle<true, true, false, false, true>},
    968        {&GPU_SW_Backend::DrawTriangle<true, true, false, true, false>,
    969         &GPU_SW_Backend::DrawTriangle<true, true, false, true, true>}},
    970       {{&GPU_SW_Backend::DrawTriangle<true, true, true, false, false>,
    971         &GPU_SW_Backend::DrawTriangle<true, true, true, false, false>},
    972        {&GPU_SW_Backend::DrawTriangle<true, true, true, true, false>,
    973         &GPU_SW_Backend::DrawTriangle<true, true, true, true, false>}}}}};
    974 
    975   return funcs[u8(shading_enable)][u8(texture_enable)][u8(raw_texture_enable)][u8(transparency_enable)]
    976               [u8(dithering_enable)];
    977 }