SDL_blit_A.c (51115B)
1 /* 2 Simple DirectMedia Layer 3 Copyright (C) 1997-2020 Sam Lantinga <slouken@libsdl.org> 4 5 This software is provided 'as-is', without any express or implied 6 warranty. In no event will the authors be held liable for any damages 7 arising from the use of this software. 8 9 Permission is granted to anyone to use this software for any purpose, 10 including commercial applications, and to alter it and redistribute it 11 freely, subject to the following restrictions: 12 13 1. The origin of this software must not be misrepresented; you must not 14 claim that you wrote the original software. If you use this software 15 in a product, an acknowledgment in the product documentation would be 16 appreciated but is not required. 17 2. Altered source versions must be plainly marked as such, and must not be 18 misrepresented as being the original software. 19 3. This notice may not be removed or altered from any source distribution. 20 */ 21 #include "../SDL_internal.h" 22 23 #if SDL_HAVE_BLIT_A 24 25 #include "SDL_video.h" 26 #include "SDL_blit.h" 27 28 /* Functions to perform alpha blended blitting */ 29 30 /* N->1 blending with per-surface alpha */ 31 static void 32 BlitNto1SurfaceAlpha(SDL_BlitInfo * info) 33 { 34 int width = info->dst_w; 35 int height = info->dst_h; 36 Uint8 *src = info->src; 37 int srcskip = info->src_skip; 38 Uint8 *dst = info->dst; 39 int dstskip = info->dst_skip; 40 Uint8 *palmap = info->table; 41 SDL_PixelFormat *srcfmt = info->src_fmt; 42 SDL_PixelFormat *dstfmt = info->dst_fmt; 43 int srcbpp = srcfmt->BytesPerPixel; 44 Uint32 Pixel; 45 unsigned sR, sG, sB; 46 unsigned dR, dG, dB; 47 const unsigned A = info->a; 48 49 while (height--) { 50 /* *INDENT-OFF* */ 51 DUFFS_LOOP4( 52 { 53 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB); 54 dR = dstfmt->palette->colors[*dst].r; 55 dG = dstfmt->palette->colors[*dst].g; 56 dB = dstfmt->palette->colors[*dst].b; 57 ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB); 58 dR &= 0xff; 59 dG &= 0xff; 60 dB &= 0xff; 61 /* Pack RGB into 8bit pixel */ 62 if ( palmap == NULL ) { 63 *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0)); 64 } else { 65 *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))]; 66 } 67 dst++; 68 src += srcbpp; 69 }, 70 width); 71 /* *INDENT-ON* */ 72 src += srcskip; 73 dst += dstskip; 74 } 75 } 76 77 /* N->1 blending with pixel alpha */ 78 static void 79 BlitNto1PixelAlpha(SDL_BlitInfo * info) 80 { 81 int width = info->dst_w; 82 int height = info->dst_h; 83 Uint8 *src = info->src; 84 int srcskip = info->src_skip; 85 Uint8 *dst = info->dst; 86 int dstskip = info->dst_skip; 87 Uint8 *palmap = info->table; 88 SDL_PixelFormat *srcfmt = info->src_fmt; 89 SDL_PixelFormat *dstfmt = info->dst_fmt; 90 int srcbpp = srcfmt->BytesPerPixel; 91 Uint32 Pixel; 92 unsigned sR, sG, sB, sA; 93 unsigned dR, dG, dB; 94 95 while (height--) { 96 /* *INDENT-OFF* */ 97 DUFFS_LOOP4( 98 { 99 DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA); 100 dR = dstfmt->palette->colors[*dst].r; 101 dG = dstfmt->palette->colors[*dst].g; 102 dB = dstfmt->palette->colors[*dst].b; 103 ALPHA_BLEND_RGB(sR, sG, sB, sA, dR, dG, dB); 104 dR &= 0xff; 105 dG &= 0xff; 106 dB &= 0xff; 107 /* Pack RGB into 8bit pixel */ 108 if ( palmap == NULL ) { 109 *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0)); 110 } else { 111 *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))]; 112 } 113 dst++; 114 src += srcbpp; 115 }, 116 width); 117 /* *INDENT-ON* */ 118 src += srcskip; 119 dst += dstskip; 120 } 121 } 122 123 /* colorkeyed N->1 blending with per-surface alpha */ 124 static void 125 BlitNto1SurfaceAlphaKey(SDL_BlitInfo * info) 126 { 127 int width = info->dst_w; 128 int height = info->dst_h; 129 Uint8 *src = info->src; 130 int srcskip = info->src_skip; 131 Uint8 *dst = info->dst; 132 int dstskip = info->dst_skip; 133 Uint8 *palmap = info->table; 134 SDL_PixelFormat *srcfmt = info->src_fmt; 135 SDL_PixelFormat *dstfmt = info->dst_fmt; 136 int srcbpp = srcfmt->BytesPerPixel; 137 Uint32 ckey = info->colorkey; 138 Uint32 Pixel; 139 unsigned sR, sG, sB; 140 unsigned dR, dG, dB; 141 const unsigned A = info->a; 142 143 while (height--) { 144 /* *INDENT-OFF* */ 145 DUFFS_LOOP( 146 { 147 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB); 148 if ( Pixel != ckey ) { 149 dR = dstfmt->palette->colors[*dst].r; 150 dG = dstfmt->palette->colors[*dst].g; 151 dB = dstfmt->palette->colors[*dst].b; 152 ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB); 153 dR &= 0xff; 154 dG &= 0xff; 155 dB &= 0xff; 156 /* Pack RGB into 8bit pixel */ 157 if ( palmap == NULL ) { 158 *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0)); 159 } else { 160 *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))]; 161 } 162 } 163 dst++; 164 src += srcbpp; 165 }, 166 width); 167 /* *INDENT-ON* */ 168 src += srcskip; 169 dst += dstskip; 170 } 171 } 172 173 #ifdef __MMX__ 174 175 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ 176 static void 177 BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info) 178 { 179 int width = info->dst_w; 180 int height = info->dst_h; 181 Uint32 *srcp = (Uint32 *) info->src; 182 int srcskip = info->src_skip >> 2; 183 Uint32 *dstp = (Uint32 *) info->dst; 184 int dstskip = info->dst_skip >> 2; 185 Uint32 dalpha = info->dst_fmt->Amask; 186 187 __m64 src1, src2, dst1, dst2, lmask, hmask, dsta; 188 189 hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */ 190 lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */ 191 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */ 192 193 while (height--) { 194 int n = width; 195 if (n & 1) { 196 Uint32 s = *srcp++; 197 Uint32 d = *dstp; 198 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1) 199 + (s & d & 0x00010101)) | dalpha; 200 n--; 201 } 202 203 for (n >>= 1; n > 0; --n) { 204 dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */ 205 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */ 206 207 src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */ 208 src2 = src1; /* 2 x src -> src2(ARGBARGB) */ 209 210 dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */ 211 src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */ 212 src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */ 213 src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */ 214 215 dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */ 216 dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */ 217 dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */ 218 dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */ 219 220 *(__m64 *) dstp = dst1; /* dst1 -> 2 x dst pixels */ 221 dstp += 2; 222 srcp += 2; 223 } 224 225 srcp += srcskip; 226 dstp += dstskip; 227 } 228 _mm_empty(); 229 } 230 231 /* fast RGB888->(A)RGB888 blending with surface alpha */ 232 static void 233 BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info) 234 { 235 SDL_PixelFormat *df = info->dst_fmt; 236 Uint32 chanmask; 237 unsigned alpha = info->a; 238 239 if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) { 240 /* only call a128 version when R,G,B occupy lower bits */ 241 BlitRGBtoRGBSurfaceAlpha128MMX(info); 242 } else { 243 int width = info->dst_w; 244 int height = info->dst_h; 245 Uint32 *srcp = (Uint32 *) info->src; 246 int srcskip = info->src_skip >> 2; 247 Uint32 *dstp = (Uint32 *) info->dst; 248 int dstskip = info->dst_skip >> 2; 249 Uint32 dalpha = df->Amask; 250 Uint32 amult; 251 252 __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta; 253 254 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */ 255 /* form the alpha mult */ 256 amult = alpha | (alpha << 8); 257 amult = amult | (amult << 16); 258 chanmask = 259 (0xff << df->Rshift) | (0xff << df-> 260 Gshift) | (0xff << df->Bshift); 261 mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */ 262 mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */ 263 /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */ 264 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */ 265 266 while (height--) { 267 int n = width; 268 if (n & 1) { 269 /* One Pixel Blend */ 270 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB) */ 271 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */ 272 273 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */ 274 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */ 275 276 src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst2 -> src2 */ 277 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ 278 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */ 279 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */ 280 281 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */ 282 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */ 283 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */ 284 285 ++srcp; 286 ++dstp; 287 288 n--; 289 } 290 291 for (n >>= 1; n > 0; --n) { 292 /* Two Pixels Blend */ 293 src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */ 294 src2 = src1; /* 2 x src -> src2(ARGBARGB) */ 295 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */ 296 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */ 297 298 dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */ 299 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */ 300 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */ 301 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */ 302 303 src1 = _mm_sub_pi16(src1, dst1); /* src1 - dst1 -> src1 */ 304 src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */ 305 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */ 306 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */ 307 308 src2 = _mm_sub_pi16(src2, dst2); /* src2 - dst2 -> src2 */ 309 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ 310 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */ 311 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */ 312 313 dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */ 314 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */ 315 316 *(__m64 *) dstp = dst1; /* dst1 -> 2 x pixel */ 317 318 srcp += 2; 319 dstp += 2; 320 } 321 srcp += srcskip; 322 dstp += dstskip; 323 } 324 _mm_empty(); 325 } 326 } 327 328 /* fast ARGB888->(A)RGB888 blending with pixel alpha */ 329 static void 330 BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info) 331 { 332 int width = info->dst_w; 333 int height = info->dst_h; 334 Uint32 *srcp = (Uint32 *) info->src; 335 int srcskip = info->src_skip >> 2; 336 Uint32 *dstp = (Uint32 *) info->dst; 337 int dstskip = info->dst_skip >> 2; 338 SDL_PixelFormat *sf = info->src_fmt; 339 Uint32 amask = sf->Amask; 340 Uint32 ashift = sf->Ashift; 341 Uint64 multmask, multmask2; 342 343 __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2; 344 345 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */ 346 multmask = 0x00FF; 347 multmask <<= (ashift * 2); 348 multmask2 = 0x00FF00FF00FF00FFULL; 349 350 while (height--) { 351 /* *INDENT-OFF* */ 352 DUFFS_LOOP4({ 353 Uint32 alpha = *srcp & amask; 354 if (alpha == 0) { 355 /* do nothing */ 356 } else if (alpha == amask) { 357 *dstp = *srcp; 358 } else { 359 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */ 360 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */ 361 362 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */ 363 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */ 364 365 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */ 366 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */ 367 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */ 368 mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */ 369 mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask); /* 0F0A0A0A -> mm_alpha */ 370 mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2); /* 255 - mm_alpha -> mm_alpha */ 371 372 /* blend */ 373 src1 = _mm_mullo_pi16(src1, mm_alpha); 374 src1 = _mm_srli_pi16(src1, 8); 375 dst1 = _mm_mullo_pi16(dst1, mm_alpha2); 376 dst1 = _mm_srli_pi16(dst1, 8); 377 dst1 = _mm_add_pi16(src1, dst1); 378 dst1 = _mm_packs_pu16(dst1, mm_zero); 379 380 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */ 381 } 382 ++srcp; 383 ++dstp; 384 }, width); 385 /* *INDENT-ON* */ 386 srcp += srcskip; 387 dstp += dstskip; 388 } 389 _mm_empty(); 390 } 391 392 #endif /* __MMX__ */ 393 394 #if SDL_ARM_SIMD_BLITTERS 395 void BlitARGBto565PixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride); 396 397 static void 398 BlitARGBto565PixelAlphaARMSIMD(SDL_BlitInfo * info) 399 { 400 int32_t width = info->dst_w; 401 int32_t height = info->dst_h; 402 uint16_t *dstp = (uint16_t *)info->dst; 403 int32_t dststride = width + (info->dst_skip >> 1); 404 uint32_t *srcp = (uint32_t *)info->src; 405 int32_t srcstride = width + (info->src_skip >> 2); 406 407 BlitARGBto565PixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride); 408 } 409 410 void BlitRGBtoRGBPixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride); 411 412 static void 413 BlitRGBtoRGBPixelAlphaARMSIMD(SDL_BlitInfo * info) 414 { 415 int32_t width = info->dst_w; 416 int32_t height = info->dst_h; 417 uint32_t *dstp = (uint32_t *)info->dst; 418 int32_t dststride = width + (info->dst_skip >> 2); 419 uint32_t *srcp = (uint32_t *)info->src; 420 int32_t srcstride = width + (info->src_skip >> 2); 421 422 BlitRGBtoRGBPixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride); 423 } 424 #endif 425 426 #if SDL_ARM_NEON_BLITTERS 427 void BlitARGBto565PixelAlphaARMNEONAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride); 428 429 static void 430 BlitARGBto565PixelAlphaARMNEON(SDL_BlitInfo * info) 431 { 432 int32_t width = info->dst_w; 433 int32_t height = info->dst_h; 434 uint16_t *dstp = (uint16_t *)info->dst; 435 int32_t dststride = width + (info->dst_skip >> 1); 436 uint32_t *srcp = (uint32_t *)info->src; 437 int32_t srcstride = width + (info->src_skip >> 2); 438 439 BlitARGBto565PixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride); 440 } 441 442 void BlitRGBtoRGBPixelAlphaARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride); 443 444 static void 445 BlitRGBtoRGBPixelAlphaARMNEON(SDL_BlitInfo * info) 446 { 447 int32_t width = info->dst_w; 448 int32_t height = info->dst_h; 449 uint32_t *dstp = (uint32_t *)info->dst; 450 int32_t dststride = width + (info->dst_skip >> 2); 451 uint32_t *srcp = (uint32_t *)info->src; 452 int32_t srcstride = width + (info->src_skip >> 2); 453 454 BlitRGBtoRGBPixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride); 455 } 456 #endif 457 458 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ 459 static void 460 BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info) 461 { 462 int width = info->dst_w; 463 int height = info->dst_h; 464 Uint32 *srcp = (Uint32 *) info->src; 465 int srcskip = info->src_skip >> 2; 466 Uint32 *dstp = (Uint32 *) info->dst; 467 int dstskip = info->dst_skip >> 2; 468 469 while (height--) { 470 /* *INDENT-OFF* */ 471 DUFFS_LOOP4({ 472 Uint32 s = *srcp++; 473 Uint32 d = *dstp; 474 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1) 475 + (s & d & 0x00010101)) | 0xff000000; 476 }, width); 477 /* *INDENT-ON* */ 478 srcp += srcskip; 479 dstp += dstskip; 480 } 481 } 482 483 /* fast RGB888->(A)RGB888 blending with surface alpha */ 484 static void 485 BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info) 486 { 487 unsigned alpha = info->a; 488 if (alpha == 128) { 489 BlitRGBtoRGBSurfaceAlpha128(info); 490 } else { 491 int width = info->dst_w; 492 int height = info->dst_h; 493 Uint32 *srcp = (Uint32 *) info->src; 494 int srcskip = info->src_skip >> 2; 495 Uint32 *dstp = (Uint32 *) info->dst; 496 int dstskip = info->dst_skip >> 2; 497 Uint32 s; 498 Uint32 d; 499 Uint32 s1; 500 Uint32 d1; 501 502 while (height--) { 503 /* *INDENT-OFF* */ 504 DUFFS_LOOP4({ 505 s = *srcp; 506 d = *dstp; 507 s1 = s & 0xff00ff; 508 d1 = d & 0xff00ff; 509 d1 = (d1 + ((s1 - d1) * alpha >> 8)) 510 & 0xff00ff; 511 s &= 0xff00; 512 d &= 0xff00; 513 d = (d + ((s - d) * alpha >> 8)) & 0xff00; 514 *dstp = d1 | d | 0xff000000; 515 ++srcp; 516 ++dstp; 517 }, width); 518 /* *INDENT-ON* */ 519 srcp += srcskip; 520 dstp += dstskip; 521 } 522 } 523 } 524 525 /* fast ARGB888->(A)RGB888 blending with pixel alpha */ 526 static void 527 BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info) 528 { 529 int width = info->dst_w; 530 int height = info->dst_h; 531 Uint32 *srcp = (Uint32 *) info->src; 532 int srcskip = info->src_skip >> 2; 533 Uint32 *dstp = (Uint32 *) info->dst; 534 int dstskip = info->dst_skip >> 2; 535 536 while (height--) { 537 /* *INDENT-OFF* */ 538 DUFFS_LOOP4({ 539 Uint32 dalpha; 540 Uint32 d; 541 Uint32 s1; 542 Uint32 d1; 543 Uint32 s = *srcp; 544 Uint32 alpha = s >> 24; 545 /* FIXME: Here we special-case opaque alpha since the 546 compositioning used (>>8 instead of /255) doesn't handle 547 it correctly. Also special-case alpha=0 for speed? 548 Benchmark this! */ 549 if (alpha) { 550 if (alpha == SDL_ALPHA_OPAQUE) { 551 *dstp = *srcp; 552 } else { 553 /* 554 * take out the middle component (green), and process 555 * the other two in parallel. One multiply less. 556 */ 557 d = *dstp; 558 dalpha = d >> 24; 559 s1 = s & 0xff00ff; 560 d1 = d & 0xff00ff; 561 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; 562 s &= 0xff00; 563 d &= 0xff00; 564 d = (d + ((s - d) * alpha >> 8)) & 0xff00; 565 dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8); 566 *dstp = d1 | d | (dalpha << 24); 567 } 568 } 569 ++srcp; 570 ++dstp; 571 }, width); 572 /* *INDENT-ON* */ 573 srcp += srcskip; 574 dstp += dstskip; 575 } 576 } 577 578 #ifdef __3dNOW__ 579 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */ 580 static void 581 BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info) 582 { 583 int width = info->dst_w; 584 int height = info->dst_h; 585 Uint32 *srcp = (Uint32 *) info->src; 586 int srcskip = info->src_skip >> 2; 587 Uint32 *dstp = (Uint32 *) info->dst; 588 int dstskip = info->dst_skip >> 2; 589 SDL_PixelFormat *sf = info->src_fmt; 590 Uint32 amask = sf->Amask; 591 Uint32 ashift = sf->Ashift; 592 Uint64 multmask, multmask2; 593 594 __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2; 595 596 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */ 597 multmask = 0x00FF; 598 multmask <<= (ashift * 2); 599 multmask2 = 0x00FF00FF00FF00FFULL; 600 601 while (height--) { 602 /* *INDENT-OFF* */ 603 DUFFS_LOOP4({ 604 Uint32 alpha; 605 606 _m_prefetch(srcp + 16); 607 _m_prefetch(dstp + 16); 608 609 alpha = *srcp & amask; 610 if (alpha == 0) { 611 /* do nothing */ 612 } else if (alpha == amask) { 613 *dstp = *srcp; 614 } else { 615 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */ 616 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */ 617 618 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */ 619 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */ 620 621 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */ 622 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */ 623 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */ 624 mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */ 625 mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask); /* 0F0A0A0A -> mm_alpha */ 626 mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2); /* 255 - mm_alpha -> mm_alpha */ 627 628 629 /* blend */ 630 src1 = _mm_mullo_pi16(src1, mm_alpha); 631 src1 = _mm_srli_pi16(src1, 8); 632 dst1 = _mm_mullo_pi16(dst1, mm_alpha2); 633 dst1 = _mm_srli_pi16(dst1, 8); 634 dst1 = _mm_add_pi16(src1, dst1); 635 dst1 = _mm_packs_pu16(dst1, mm_zero); 636 637 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */ 638 } 639 ++srcp; 640 ++dstp; 641 }, width); 642 /* *INDENT-ON* */ 643 srcp += srcskip; 644 dstp += dstskip; 645 } 646 _mm_empty(); 647 } 648 649 #endif /* __3dNOW__ */ 650 651 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */ 652 653 /* blend a single 16 bit pixel at 50% */ 654 #define BLEND16_50(d, s, mask) \ 655 ((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff))) 656 657 /* blend two 16 bit pixels at 50% */ 658 #define BLEND2x16_50(d, s, mask) \ 659 (((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \ 660 + (s & d & (~(mask | mask << 16)))) 661 662 static void 663 Blit16to16SurfaceAlpha128(SDL_BlitInfo * info, Uint16 mask) 664 { 665 int width = info->dst_w; 666 int height = info->dst_h; 667 Uint16 *srcp = (Uint16 *) info->src; 668 int srcskip = info->src_skip >> 1; 669 Uint16 *dstp = (Uint16 *) info->dst; 670 int dstskip = info->dst_skip >> 1; 671 672 while (height--) { 673 if (((uintptr_t) srcp ^ (uintptr_t) dstp) & 2) { 674 /* 675 * Source and destination not aligned, pipeline it. 676 * This is mostly a win for big blits but no loss for 677 * small ones 678 */ 679 Uint32 prev_sw; 680 int w = width; 681 682 /* handle odd destination */ 683 if ((uintptr_t) dstp & 2) { 684 Uint16 d = *dstp, s = *srcp; 685 *dstp = BLEND16_50(d, s, mask); 686 dstp++; 687 srcp++; 688 w--; 689 } 690 srcp++; /* srcp is now 32-bit aligned */ 691 692 /* bootstrap pipeline with first halfword */ 693 prev_sw = ((Uint32 *) srcp)[-1]; 694 695 while (w > 1) { 696 Uint32 sw, dw, s; 697 sw = *(Uint32 *) srcp; 698 dw = *(Uint32 *) dstp; 699 #if SDL_BYTEORDER == SDL_BIG_ENDIAN 700 s = (prev_sw << 16) + (sw >> 16); 701 #else 702 s = (prev_sw >> 16) + (sw << 16); 703 #endif 704 prev_sw = sw; 705 *(Uint32 *) dstp = BLEND2x16_50(dw, s, mask); 706 dstp += 2; 707 srcp += 2; 708 w -= 2; 709 } 710 711 /* final pixel if any */ 712 if (w) { 713 Uint16 d = *dstp, s; 714 #if SDL_BYTEORDER == SDL_BIG_ENDIAN 715 s = (Uint16) prev_sw; 716 #else 717 s = (Uint16) (prev_sw >> 16); 718 #endif 719 *dstp = BLEND16_50(d, s, mask); 720 srcp++; 721 dstp++; 722 } 723 srcp += srcskip - 1; 724 dstp += dstskip; 725 } else { 726 /* source and destination are aligned */ 727 int w = width; 728 729 /* first odd pixel? */ 730 if ((uintptr_t) srcp & 2) { 731 Uint16 d = *dstp, s = *srcp; 732 *dstp = BLEND16_50(d, s, mask); 733 srcp++; 734 dstp++; 735 w--; 736 } 737 /* srcp and dstp are now 32-bit aligned */ 738 739 while (w > 1) { 740 Uint32 sw = *(Uint32 *) srcp; 741 Uint32 dw = *(Uint32 *) dstp; 742 *(Uint32 *) dstp = BLEND2x16_50(dw, sw, mask); 743 srcp += 2; 744 dstp += 2; 745 w -= 2; 746 } 747 748 /* last odd pixel? */ 749 if (w) { 750 Uint16 d = *dstp, s = *srcp; 751 *dstp = BLEND16_50(d, s, mask); 752 srcp++; 753 dstp++; 754 } 755 srcp += srcskip; 756 dstp += dstskip; 757 } 758 } 759 } 760 761 #ifdef __MMX__ 762 763 /* fast RGB565->RGB565 blending with surface alpha */ 764 static void 765 Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info) 766 { 767 unsigned alpha = info->a; 768 if (alpha == 128) { 769 Blit16to16SurfaceAlpha128(info, 0xf7de); 770 } else { 771 int width = info->dst_w; 772 int height = info->dst_h; 773 Uint16 *srcp = (Uint16 *) info->src; 774 int srcskip = info->src_skip >> 1; 775 Uint16 *dstp = (Uint16 *) info->dst; 776 int dstskip = info->dst_skip >> 1; 777 Uint32 s, d; 778 779 __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha; 780 781 alpha &= ~(1 + 2 + 4); /* cut alpha to get the exact same behaviour */ 782 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */ 783 alpha >>= 3; /* downscale alpha to 5 bits */ 784 785 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */ 786 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */ 787 /* position alpha to allow for mullo and mulhi on diff channels 788 to reduce the number of operations */ 789 mm_alpha = _mm_slli_si64(mm_alpha, 3); 790 791 /* Setup the 565 color channel masks */ 792 gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */ 793 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */ 794 795 while (height--) { 796 /* *INDENT-OFF* */ 797 DUFFS_LOOP_124( 798 { 799 s = *srcp++; 800 d = *dstp; 801 /* 802 * shift out the middle component (green) to 803 * the high 16 bits, and process all three RGB 804 * components at the same time. 805 */ 806 s = (s | s << 16) & 0x07e0f81f; 807 d = (d | d << 16) & 0x07e0f81f; 808 d += (s - d) * alpha >> 5; 809 d &= 0x07e0f81f; 810 *dstp++ = (Uint16)(d | d >> 16); 811 },{ 812 s = *srcp++; 813 d = *dstp; 814 /* 815 * shift out the middle component (green) to 816 * the high 16 bits, and process all three RGB 817 * components at the same time. 818 */ 819 s = (s | s << 16) & 0x07e0f81f; 820 d = (d | d << 16) & 0x07e0f81f; 821 d += (s - d) * alpha >> 5; 822 d &= 0x07e0f81f; 823 *dstp++ = (Uint16)(d | d >> 16); 824 s = *srcp++; 825 d = *dstp; 826 /* 827 * shift out the middle component (green) to 828 * the high 16 bits, and process all three RGB 829 * components at the same time. 830 */ 831 s = (s | s << 16) & 0x07e0f81f; 832 d = (d | d << 16) & 0x07e0f81f; 833 d += (s - d) * alpha >> 5; 834 d &= 0x07e0f81f; 835 *dstp++ = (Uint16)(d | d >> 16); 836 },{ 837 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */ 838 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */ 839 840 /* red */ 841 src2 = src1; 842 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */ 843 844 dst2 = dst1; 845 dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */ 846 847 /* blend */ 848 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ 849 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ 850 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */ 851 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */ 852 dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */ 853 854 mm_res = dst2; /* RED -> mm_res */ 855 856 /* green -- process the bits in place */ 857 src2 = src1; 858 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */ 859 860 dst2 = dst1; 861 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */ 862 863 /* blend */ 864 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ 865 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ 866 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */ 867 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */ 868 869 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */ 870 871 /* blue */ 872 src2 = src1; 873 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */ 874 875 dst2 = dst1; 876 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */ 877 878 /* blend */ 879 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ 880 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ 881 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */ 882 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */ 883 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */ 884 885 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */ 886 887 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */ 888 889 srcp += 4; 890 dstp += 4; 891 }, width); 892 /* *INDENT-ON* */ 893 srcp += srcskip; 894 dstp += dstskip; 895 } 896 _mm_empty(); 897 } 898 } 899 900 /* fast RGB555->RGB555 blending with surface alpha */ 901 static void 902 Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info) 903 { 904 unsigned alpha = info->a; 905 if (alpha == 128) { 906 Blit16to16SurfaceAlpha128(info, 0xfbde); 907 } else { 908 int width = info->dst_w; 909 int height = info->dst_h; 910 Uint16 *srcp = (Uint16 *) info->src; 911 int srcskip = info->src_skip >> 1; 912 Uint16 *dstp = (Uint16 *) info->dst; 913 int dstskip = info->dst_skip >> 1; 914 Uint32 s, d; 915 916 __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha; 917 918 alpha &= ~(1 + 2 + 4); /* cut alpha to get the exact same behaviour */ 919 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */ 920 alpha >>= 3; /* downscale alpha to 5 bits */ 921 922 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */ 923 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */ 924 /* position alpha to allow for mullo and mulhi on diff channels 925 to reduce the number of operations */ 926 mm_alpha = _mm_slli_si64(mm_alpha, 3); 927 928 /* Setup the 555 color channel masks */ 929 rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */ 930 gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */ 931 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */ 932 933 while (height--) { 934 /* *INDENT-OFF* */ 935 DUFFS_LOOP_124( 936 { 937 s = *srcp++; 938 d = *dstp; 939 /* 940 * shift out the middle component (green) to 941 * the high 16 bits, and process all three RGB 942 * components at the same time. 943 */ 944 s = (s | s << 16) & 0x03e07c1f; 945 d = (d | d << 16) & 0x03e07c1f; 946 d += (s - d) * alpha >> 5; 947 d &= 0x03e07c1f; 948 *dstp++ = (Uint16)(d | d >> 16); 949 },{ 950 s = *srcp++; 951 d = *dstp; 952 /* 953 * shift out the middle component (green) to 954 * the high 16 bits, and process all three RGB 955 * components at the same time. 956 */ 957 s = (s | s << 16) & 0x03e07c1f; 958 d = (d | d << 16) & 0x03e07c1f; 959 d += (s - d) * alpha >> 5; 960 d &= 0x03e07c1f; 961 *dstp++ = (Uint16)(d | d >> 16); 962 s = *srcp++; 963 d = *dstp; 964 /* 965 * shift out the middle component (green) to 966 * the high 16 bits, and process all three RGB 967 * components at the same time. 968 */ 969 s = (s | s << 16) & 0x03e07c1f; 970 d = (d | d << 16) & 0x03e07c1f; 971 d += (s - d) * alpha >> 5; 972 d &= 0x03e07c1f; 973 *dstp++ = (Uint16)(d | d >> 16); 974 },{ 975 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */ 976 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */ 977 978 /* red -- process the bits in place */ 979 src2 = src1; 980 src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */ 981 982 dst2 = dst1; 983 dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */ 984 985 /* blend */ 986 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ 987 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ 988 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */ 989 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */ 990 dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */ 991 992 mm_res = dst2; /* RED -> mm_res */ 993 994 /* green -- process the bits in place */ 995 src2 = src1; 996 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */ 997 998 dst2 = dst1; 999 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */ 1000 1001 /* blend */ 1002 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ 1003 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ 1004 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */ 1005 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */ 1006 1007 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */ 1008 1009 /* blue */ 1010 src2 = src1; /* src -> src2 */ 1011 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */ 1012 1013 dst2 = dst1; /* dst -> dst2 */ 1014 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */ 1015 1016 /* blend */ 1017 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ 1018 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ 1019 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */ 1020 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */ 1021 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */ 1022 1023 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */ 1024 1025 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */ 1026 1027 srcp += 4; 1028 dstp += 4; 1029 }, width); 1030 /* *INDENT-ON* */ 1031 srcp += srcskip; 1032 dstp += dstskip; 1033 } 1034 _mm_empty(); 1035 } 1036 } 1037 1038 #endif /* __MMX__ */ 1039 1040 /* fast RGB565->RGB565 blending with surface alpha */ 1041 static void 1042 Blit565to565SurfaceAlpha(SDL_BlitInfo * info) 1043 { 1044 unsigned alpha = info->a; 1045 if (alpha == 128) { 1046 Blit16to16SurfaceAlpha128(info, 0xf7de); 1047 } else { 1048 int width = info->dst_w; 1049 int height = info->dst_h; 1050 Uint16 *srcp = (Uint16 *) info->src; 1051 int srcskip = info->src_skip >> 1; 1052 Uint16 *dstp = (Uint16 *) info->dst; 1053 int dstskip = info->dst_skip >> 1; 1054 alpha >>= 3; /* downscale alpha to 5 bits */ 1055 1056 while (height--) { 1057 /* *INDENT-OFF* */ 1058 DUFFS_LOOP4({ 1059 Uint32 s = *srcp++; 1060 Uint32 d = *dstp; 1061 /* 1062 * shift out the middle component (green) to 1063 * the high 16 bits, and process all three RGB 1064 * components at the same time. 1065 */ 1066 s = (s | s << 16) & 0x07e0f81f; 1067 d = (d | d << 16) & 0x07e0f81f; 1068 d += (s - d) * alpha >> 5; 1069 d &= 0x07e0f81f; 1070 *dstp++ = (Uint16)(d | d >> 16); 1071 }, width); 1072 /* *INDENT-ON* */ 1073 srcp += srcskip; 1074 dstp += dstskip; 1075 } 1076 } 1077 } 1078 1079 /* fast RGB555->RGB555 blending with surface alpha */ 1080 static void 1081 Blit555to555SurfaceAlpha(SDL_BlitInfo * info) 1082 { 1083 unsigned alpha = info->a; /* downscale alpha to 5 bits */ 1084 if (alpha == 128) { 1085 Blit16to16SurfaceAlpha128(info, 0xfbde); 1086 } else { 1087 int width = info->dst_w; 1088 int height = info->dst_h; 1089 Uint16 *srcp = (Uint16 *) info->src; 1090 int srcskip = info->src_skip >> 1; 1091 Uint16 *dstp = (Uint16 *) info->dst; 1092 int dstskip = info->dst_skip >> 1; 1093 alpha >>= 3; /* downscale alpha to 5 bits */ 1094 1095 while (height--) { 1096 /* *INDENT-OFF* */ 1097 DUFFS_LOOP4({ 1098 Uint32 s = *srcp++; 1099 Uint32 d = *dstp; 1100 /* 1101 * shift out the middle component (green) to 1102 * the high 16 bits, and process all three RGB 1103 * components at the same time. 1104 */ 1105 s = (s | s << 16) & 0x03e07c1f; 1106 d = (d | d << 16) & 0x03e07c1f; 1107 d += (s - d) * alpha >> 5; 1108 d &= 0x03e07c1f; 1109 *dstp++ = (Uint16)(d | d >> 16); 1110 }, width); 1111 /* *INDENT-ON* */ 1112 srcp += srcskip; 1113 dstp += dstskip; 1114 } 1115 } 1116 } 1117 1118 /* fast ARGB8888->RGB565 blending with pixel alpha */ 1119 static void 1120 BlitARGBto565PixelAlpha(SDL_BlitInfo * info) 1121 { 1122 int width = info->dst_w; 1123 int height = info->dst_h; 1124 Uint32 *srcp = (Uint32 *) info->src; 1125 int srcskip = info->src_skip >> 2; 1126 Uint16 *dstp = (Uint16 *) info->dst; 1127 int dstskip = info->dst_skip >> 1; 1128 1129 while (height--) { 1130 /* *INDENT-OFF* */ 1131 DUFFS_LOOP4({ 1132 Uint32 s = *srcp; 1133 unsigned alpha = s >> 27; /* downscale alpha to 5 bits */ 1134 /* FIXME: Here we special-case opaque alpha since the 1135 compositioning used (>>8 instead of /255) doesn't handle 1136 it correctly. Also special-case alpha=0 for speed? 1137 Benchmark this! */ 1138 if(alpha) { 1139 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) { 1140 *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3 & 0x1f)); 1141 } else { 1142 Uint32 d = *dstp; 1143 /* 1144 * convert source and destination to G0RAB65565 1145 * and blend all components at the same time 1146 */ 1147 s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800) 1148 + (s >> 3 & 0x1f); 1149 d = (d | d << 16) & 0x07e0f81f; 1150 d += (s - d) * alpha >> 5; 1151 d &= 0x07e0f81f; 1152 *dstp = (Uint16)(d | d >> 16); 1153 } 1154 } 1155 srcp++; 1156 dstp++; 1157 }, width); 1158 /* *INDENT-ON* */ 1159 srcp += srcskip; 1160 dstp += dstskip; 1161 } 1162 } 1163 1164 /* fast ARGB8888->RGB555 blending with pixel alpha */ 1165 static void 1166 BlitARGBto555PixelAlpha(SDL_BlitInfo * info) 1167 { 1168 int width = info->dst_w; 1169 int height = info->dst_h; 1170 Uint32 *srcp = (Uint32 *) info->src; 1171 int srcskip = info->src_skip >> 2; 1172 Uint16 *dstp = (Uint16 *) info->dst; 1173 int dstskip = info->dst_skip >> 1; 1174 1175 while (height--) { 1176 /* *INDENT-OFF* */ 1177 DUFFS_LOOP4({ 1178 unsigned alpha; 1179 Uint32 s = *srcp; 1180 alpha = s >> 27; /* downscale alpha to 5 bits */ 1181 /* FIXME: Here we special-case opaque alpha since the 1182 compositioning used (>>8 instead of /255) doesn't handle 1183 it correctly. Also special-case alpha=0 for speed? 1184 Benchmark this! */ 1185 if(alpha) { 1186 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) { 1187 *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3 & 0x1f)); 1188 } else { 1189 Uint32 d = *dstp; 1190 /* 1191 * convert source and destination to G0RAB65565 1192 * and blend all components at the same time 1193 */ 1194 s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00) 1195 + (s >> 3 & 0x1f); 1196 d = (d | d << 16) & 0x03e07c1f; 1197 d += (s - d) * alpha >> 5; 1198 d &= 0x03e07c1f; 1199 *dstp = (Uint16)(d | d >> 16); 1200 } 1201 } 1202 srcp++; 1203 dstp++; 1204 }, width); 1205 /* *INDENT-ON* */ 1206 srcp += srcskip; 1207 dstp += dstskip; 1208 } 1209 } 1210 1211 /* General (slow) N->N blending with per-surface alpha */ 1212 static void 1213 BlitNtoNSurfaceAlpha(SDL_BlitInfo * info) 1214 { 1215 int width = info->dst_w; 1216 int height = info->dst_h; 1217 Uint8 *src = info->src; 1218 int srcskip = info->src_skip; 1219 Uint8 *dst = info->dst; 1220 int dstskip = info->dst_skip; 1221 SDL_PixelFormat *srcfmt = info->src_fmt; 1222 SDL_PixelFormat *dstfmt = info->dst_fmt; 1223 int srcbpp = srcfmt->BytesPerPixel; 1224 int dstbpp = dstfmt->BytesPerPixel; 1225 Uint32 Pixel; 1226 unsigned sR, sG, sB; 1227 unsigned dR, dG, dB, dA; 1228 const unsigned sA = info->a; 1229 1230 if (sA) { 1231 while (height--) { 1232 /* *INDENT-OFF* */ 1233 DUFFS_LOOP4( 1234 { 1235 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB); 1236 DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA); 1237 ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA); 1238 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA); 1239 src += srcbpp; 1240 dst += dstbpp; 1241 }, 1242 width); 1243 /* *INDENT-ON* */ 1244 src += srcskip; 1245 dst += dstskip; 1246 } 1247 } 1248 } 1249 1250 /* General (slow) colorkeyed N->N blending with per-surface alpha */ 1251 static void 1252 BlitNtoNSurfaceAlphaKey(SDL_BlitInfo * info) 1253 { 1254 int width = info->dst_w; 1255 int height = info->dst_h; 1256 Uint8 *src = info->src; 1257 int srcskip = info->src_skip; 1258 Uint8 *dst = info->dst; 1259 int dstskip = info->dst_skip; 1260 SDL_PixelFormat *srcfmt = info->src_fmt; 1261 SDL_PixelFormat *dstfmt = info->dst_fmt; 1262 Uint32 ckey = info->colorkey; 1263 int srcbpp = srcfmt->BytesPerPixel; 1264 int dstbpp = dstfmt->BytesPerPixel; 1265 Uint32 Pixel; 1266 unsigned sR, sG, sB; 1267 unsigned dR, dG, dB, dA; 1268 const unsigned sA = info->a; 1269 1270 while (height--) { 1271 /* *INDENT-OFF* */ 1272 DUFFS_LOOP4( 1273 { 1274 RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel); 1275 if(sA && Pixel != ckey) { 1276 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); 1277 DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA); 1278 ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA); 1279 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA); 1280 } 1281 src += srcbpp; 1282 dst += dstbpp; 1283 }, 1284 width); 1285 /* *INDENT-ON* */ 1286 src += srcskip; 1287 dst += dstskip; 1288 } 1289 } 1290 1291 /* General (slow) N->N blending with pixel alpha */ 1292 static void 1293 BlitNtoNPixelAlpha(SDL_BlitInfo * info) 1294 { 1295 int width = info->dst_w; 1296 int height = info->dst_h; 1297 Uint8 *src = info->src; 1298 int srcskip = info->src_skip; 1299 Uint8 *dst = info->dst; 1300 int dstskip = info->dst_skip; 1301 SDL_PixelFormat *srcfmt = info->src_fmt; 1302 SDL_PixelFormat *dstfmt = info->dst_fmt; 1303 int srcbpp; 1304 int dstbpp; 1305 Uint32 Pixel; 1306 unsigned sR, sG, sB, sA; 1307 unsigned dR, dG, dB, dA; 1308 1309 /* Set up some basic variables */ 1310 srcbpp = srcfmt->BytesPerPixel; 1311 dstbpp = dstfmt->BytesPerPixel; 1312 1313 while (height--) { 1314 /* *INDENT-OFF* */ 1315 DUFFS_LOOP4( 1316 { 1317 DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA); 1318 if(sA) { 1319 DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA); 1320 ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA); 1321 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA); 1322 } 1323 src += srcbpp; 1324 dst += dstbpp; 1325 }, 1326 width); 1327 /* *INDENT-ON* */ 1328 src += srcskip; 1329 dst += dstskip; 1330 } 1331 } 1332 1333 1334 SDL_BlitFunc 1335 SDL_CalculateBlitA(SDL_Surface * surface) 1336 { 1337 SDL_PixelFormat *sf = surface->format; 1338 SDL_PixelFormat *df = surface->map->dst->format; 1339 1340 switch (surface->map->info.flags & ~SDL_COPY_RLE_MASK) { 1341 case SDL_COPY_BLEND: 1342 /* Per-pixel alpha blits */ 1343 switch (df->BytesPerPixel) { 1344 case 1: 1345 if (df->palette != NULL) { 1346 return BlitNto1PixelAlpha; 1347 } else { 1348 /* RGB332 has no palette ! */ 1349 return BlitNtoNPixelAlpha; 1350 } 1351 1352 case 2: 1353 #if SDL_ARM_NEON_BLITTERS || SDL_ARM_SIMD_BLITTERS 1354 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000 1355 && sf->Gmask == 0xff00 && df->Gmask == 0x7e0 1356 && ((sf->Rmask == 0xff && df->Rmask == 0x1f) 1357 || (sf->Bmask == 0xff && df->Bmask == 0x1f))) 1358 { 1359 #if SDL_ARM_NEON_BLITTERS 1360 if (SDL_HasNEON()) 1361 return BlitARGBto565PixelAlphaARMNEON; 1362 #endif 1363 #if SDL_ARM_SIMD_BLITTERS 1364 if (SDL_HasARMSIMD()) 1365 return BlitARGBto565PixelAlphaARMSIMD; 1366 #endif 1367 } 1368 #endif 1369 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000 1370 && sf->Gmask == 0xff00 1371 && ((sf->Rmask == 0xff && df->Rmask == 0x1f) 1372 || (sf->Bmask == 0xff && df->Bmask == 0x1f))) { 1373 if (df->Gmask == 0x7e0) 1374 return BlitARGBto565PixelAlpha; 1375 else if (df->Gmask == 0x3e0) 1376 return BlitARGBto555PixelAlpha; 1377 } 1378 return BlitNtoNPixelAlpha; 1379 1380 case 4: 1381 if (sf->Rmask == df->Rmask 1382 && sf->Gmask == df->Gmask 1383 && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) { 1384 #if defined(__MMX__) || defined(__3dNOW__) 1385 if (sf->Rshift % 8 == 0 1386 && sf->Gshift % 8 == 0 1387 && sf->Bshift % 8 == 0 1388 && sf->Ashift % 8 == 0 && sf->Aloss == 0) { 1389 #ifdef __3dNOW__ 1390 if (SDL_Has3DNow()) 1391 return BlitRGBtoRGBPixelAlphaMMX3DNOW; 1392 #endif 1393 #ifdef __MMX__ 1394 if (SDL_HasMMX()) 1395 return BlitRGBtoRGBPixelAlphaMMX; 1396 #endif 1397 } 1398 #endif /* __MMX__ || __3dNOW__ */ 1399 if (sf->Amask == 0xff000000) { 1400 #if SDL_ARM_NEON_BLITTERS 1401 if (SDL_HasNEON()) 1402 return BlitRGBtoRGBPixelAlphaARMNEON; 1403 #endif 1404 #if SDL_ARM_SIMD_BLITTERS 1405 if (SDL_HasARMSIMD()) 1406 return BlitRGBtoRGBPixelAlphaARMSIMD; 1407 #endif 1408 return BlitRGBtoRGBPixelAlpha; 1409 } 1410 } 1411 return BlitNtoNPixelAlpha; 1412 1413 case 3: 1414 default: 1415 break; 1416 } 1417 return BlitNtoNPixelAlpha; 1418 1419 case SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND: 1420 if (sf->Amask == 0) { 1421 /* Per-surface alpha blits */ 1422 switch (df->BytesPerPixel) { 1423 case 1: 1424 if (df->palette != NULL) { 1425 return BlitNto1SurfaceAlpha; 1426 } else { 1427 /* RGB332 has no palette ! */ 1428 return BlitNtoNSurfaceAlpha; 1429 } 1430 1431 case 2: 1432 if (surface->map->identity) { 1433 if (df->Gmask == 0x7e0) { 1434 #ifdef __MMX__ 1435 if (SDL_HasMMX()) 1436 return Blit565to565SurfaceAlphaMMX; 1437 else 1438 #endif 1439 return Blit565to565SurfaceAlpha; 1440 } else if (df->Gmask == 0x3e0) { 1441 #ifdef __MMX__ 1442 if (SDL_HasMMX()) 1443 return Blit555to555SurfaceAlphaMMX; 1444 else 1445 #endif 1446 return Blit555to555SurfaceAlpha; 1447 } 1448 } 1449 return BlitNtoNSurfaceAlpha; 1450 1451 case 4: 1452 if (sf->Rmask == df->Rmask 1453 && sf->Gmask == df->Gmask 1454 && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) { 1455 #ifdef __MMX__ 1456 if (sf->Rshift % 8 == 0 1457 && sf->Gshift % 8 == 0 1458 && sf->Bshift % 8 == 0 && SDL_HasMMX()) 1459 return BlitRGBtoRGBSurfaceAlphaMMX; 1460 #endif 1461 if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) { 1462 return BlitRGBtoRGBSurfaceAlpha; 1463 } 1464 } 1465 return BlitNtoNSurfaceAlpha; 1466 1467 case 3: 1468 default: 1469 return BlitNtoNSurfaceAlpha; 1470 } 1471 } 1472 break; 1473 1474 case SDL_COPY_COLORKEY | SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND: 1475 if (sf->Amask == 0) { 1476 if (df->BytesPerPixel == 1) { 1477 1478 if (df->palette != NULL) { 1479 return BlitNto1SurfaceAlphaKey; 1480 } else { 1481 /* RGB332 has no palette ! */ 1482 return BlitNtoNSurfaceAlphaKey; 1483 } 1484 } else { 1485 return BlitNtoNSurfaceAlphaKey; 1486 } 1487 } 1488 break; 1489 } 1490 1491 return NULL; 1492 } 1493 1494 #endif /* SDL_HAVE_BLIT_A */ 1495 1496 /* vi: set ts=4 sw=4 expandtab: */