SDL_blit_A.c - sdl - FORK: Simple Directmedia Layer

SDL_blit_A.c (51115B)
      1 /*
      2   Simple DirectMedia Layer
      3   Copyright (C) 1997-2020 Sam Lantinga <slouken@libsdl.org>
      4 
      5   This software is provided 'as-is', without any express or implied
      6   warranty.  In no event will the authors be held liable for any damages
      7   arising from the use of this software.
      8 
      9   Permission is granted to anyone to use this software for any purpose,
     10   including commercial applications, and to alter it and redistribute it
     11   freely, subject to the following restrictions:
     12 
     13   1. The origin of this software must not be misrepresented; you must not
     14      claim that you wrote the original software. If you use this software
     15      in a product, an acknowledgment in the product documentation would be
     16      appreciated but is not required.
     17   2. Altered source versions must be plainly marked as such, and must not be
     18      misrepresented as being the original software.
     19   3. This notice may not be removed or altered from any source distribution.
     20 */
     21 #include "../SDL_internal.h"
     22 
     23 #if SDL_HAVE_BLIT_A
     24 
     25 #include "SDL_video.h"
     26 #include "SDL_blit.h"
     27 
     28 /* Functions to perform alpha blended blitting */
     29 
     30 /* N->1 blending with per-surface alpha */
     31 static void
     32 BlitNto1SurfaceAlpha(SDL_BlitInfo * info)
     33 {
     34     int width = info->dst_w;
     35     int height = info->dst_h;
     36     Uint8 *src = info->src;
     37     int srcskip = info->src_skip;
     38     Uint8 *dst = info->dst;
     39     int dstskip = info->dst_skip;
     40     Uint8 *palmap = info->table;
     41     SDL_PixelFormat *srcfmt = info->src_fmt;
     42     SDL_PixelFormat *dstfmt = info->dst_fmt;
     43     int srcbpp = srcfmt->BytesPerPixel;
     44     Uint32 Pixel;
     45     unsigned sR, sG, sB;
     46     unsigned dR, dG, dB;
     47     const unsigned A = info->a;
     48 
     49     while (height--) {
     50         /* *INDENT-OFF* */
     51         DUFFS_LOOP4(
     52         {
     53         DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
     54         dR = dstfmt->palette->colors[*dst].r;
     55         dG = dstfmt->palette->colors[*dst].g;
     56         dB = dstfmt->palette->colors[*dst].b;
     57         ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
     58         dR &= 0xff;
     59         dG &= 0xff;
     60         dB &= 0xff;
     61         /* Pack RGB into 8bit pixel */
     62         if ( palmap == NULL ) {
     63             *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
     64         } else {
     65             *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
     66         }
     67         dst++;
     68         src += srcbpp;
     69         },
     70         width);
     71         /* *INDENT-ON* */
     72         src += srcskip;
     73         dst += dstskip;
     74     }
     75 }
     76 
     77 /* N->1 blending with pixel alpha */
     78 static void
     79 BlitNto1PixelAlpha(SDL_BlitInfo * info)
     80 {
     81     int width = info->dst_w;
     82     int height = info->dst_h;
     83     Uint8 *src = info->src;
     84     int srcskip = info->src_skip;
     85     Uint8 *dst = info->dst;
     86     int dstskip = info->dst_skip;
     87     Uint8 *palmap = info->table;
     88     SDL_PixelFormat *srcfmt = info->src_fmt;
     89     SDL_PixelFormat *dstfmt = info->dst_fmt;
     90     int srcbpp = srcfmt->BytesPerPixel;
     91     Uint32 Pixel;
     92     unsigned sR, sG, sB, sA;
     93     unsigned dR, dG, dB;
     94 
     95     while (height--) {
     96         /* *INDENT-OFF* */
     97         DUFFS_LOOP4(
     98         {
     99         DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
    100         dR = dstfmt->palette->colors[*dst].r;
    101         dG = dstfmt->palette->colors[*dst].g;
    102         dB = dstfmt->palette->colors[*dst].b;
    103         ALPHA_BLEND_RGB(sR, sG, sB, sA, dR, dG, dB);
    104         dR &= 0xff;
    105         dG &= 0xff;
    106         dB &= 0xff;
    107         /* Pack RGB into 8bit pixel */
    108         if ( palmap == NULL ) {
    109             *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
    110         } else {
    111             *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
    112         }
    113         dst++;
    114         src += srcbpp;
    115         },
    116         width);
    117         /* *INDENT-ON* */
    118         src += srcskip;
    119         dst += dstskip;
    120     }
    121 }
    122 
    123 /* colorkeyed N->1 blending with per-surface alpha */
    124 static void
    125 BlitNto1SurfaceAlphaKey(SDL_BlitInfo * info)
    126 {
    127     int width = info->dst_w;
    128     int height = info->dst_h;
    129     Uint8 *src = info->src;
    130     int srcskip = info->src_skip;
    131     Uint8 *dst = info->dst;
    132     int dstskip = info->dst_skip;
    133     Uint8 *palmap = info->table;
    134     SDL_PixelFormat *srcfmt = info->src_fmt;
    135     SDL_PixelFormat *dstfmt = info->dst_fmt;
    136     int srcbpp = srcfmt->BytesPerPixel;
    137     Uint32 ckey = info->colorkey;
    138     Uint32 Pixel;
    139     unsigned sR, sG, sB;
    140     unsigned dR, dG, dB;
    141     const unsigned A = info->a;
    142 
    143     while (height--) {
    144         /* *INDENT-OFF* */
    145         DUFFS_LOOP(
    146         {
    147         DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
    148         if ( Pixel != ckey ) {
    149             dR = dstfmt->palette->colors[*dst].r;
    150             dG = dstfmt->palette->colors[*dst].g;
    151             dB = dstfmt->palette->colors[*dst].b;
    152             ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
    153             dR &= 0xff;
    154             dG &= 0xff;
    155             dB &= 0xff;
    156             /* Pack RGB into 8bit pixel */
    157             if ( palmap == NULL ) {
    158                 *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
    159             } else {
    160                 *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
    161             }
    162         }
    163         dst++;
    164         src += srcbpp;
    165         },
    166         width);
    167         /* *INDENT-ON* */
    168         src += srcskip;
    169         dst += dstskip;
    170     }
    171 }
    172 
    173 #ifdef __MMX__
    174 
    175 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
    176 static void
    177 BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
    178 {
    179     int width = info->dst_w;
    180     int height = info->dst_h;
    181     Uint32 *srcp = (Uint32 *) info->src;
    182     int srcskip = info->src_skip >> 2;
    183     Uint32 *dstp = (Uint32 *) info->dst;
    184     int dstskip = info->dst_skip >> 2;
    185     Uint32 dalpha = info->dst_fmt->Amask;
    186 
    187     __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
    188 
    189     hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe);       /* alpha128 mask -> hmask */
    190     lmask = _mm_set_pi32(0x00010101, 0x00010101);       /* !alpha128 mask -> lmask */
    191     dsta = _mm_set_pi32(dalpha, dalpha);        /* dst alpha mask -> dsta */
    192 
    193     while (height--) {
    194         int n = width;
    195         if (n & 1) {
    196             Uint32 s = *srcp++;
    197             Uint32 d = *dstp;
    198             *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
    199                        + (s & d & 0x00010101)) | dalpha;
    200             n--;
    201         }
    202 
    203         for (n >>= 1; n > 0; --n) {
    204             dst1 = *(__m64 *) dstp;     /* 2 x dst -> dst1(ARGBARGB) */
    205             dst2 = dst1;        /* 2 x dst -> dst2(ARGBARGB) */
    206 
    207             src1 = *(__m64 *) srcp;     /* 2 x src -> src1(ARGBARGB) */
    208             src2 = src1;        /* 2 x src -> src2(ARGBARGB) */
    209 
    210             dst2 = _mm_and_si64(dst2, hmask);   /* dst & mask -> dst2 */
    211             src2 = _mm_and_si64(src2, hmask);   /* src & mask -> src2 */
    212             src2 = _mm_add_pi32(src2, dst2);    /* dst2 + src2 -> src2 */
    213             src2 = _mm_srli_pi32(src2, 1);      /* src2 >> 1 -> src2 */
    214 
    215             dst1 = _mm_and_si64(dst1, src1);    /* src & dst -> dst1 */
    216             dst1 = _mm_and_si64(dst1, lmask);   /* dst1 & !mask -> dst1 */
    217             dst1 = _mm_add_pi32(dst1, src2);    /* src2 + dst1 -> dst1 */
    218             dst1 = _mm_or_si64(dst1, dsta);     /* dsta(full alpha) | dst1 -> dst1 */
    219 
    220             *(__m64 *) dstp = dst1;     /* dst1 -> 2 x dst pixels */
    221             dstp += 2;
    222             srcp += 2;
    223         }
    224 
    225         srcp += srcskip;
    226         dstp += dstskip;
    227     }
    228     _mm_empty();
    229 }
    230 
    231 /* fast RGB888->(A)RGB888 blending with surface alpha */
    232 static void
    233 BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
    234 {
    235     SDL_PixelFormat *df = info->dst_fmt;
    236     Uint32 chanmask;
    237     unsigned alpha = info->a;
    238 
    239     if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
    240         /* only call a128 version when R,G,B occupy lower bits */
    241         BlitRGBtoRGBSurfaceAlpha128MMX(info);
    242     } else {
    243         int width = info->dst_w;
    244         int height = info->dst_h;
    245         Uint32 *srcp = (Uint32 *) info->src;
    246         int srcskip = info->src_skip >> 2;
    247         Uint32 *dstp = (Uint32 *) info->dst;
    248         int dstskip = info->dst_skip >> 2;
    249         Uint32 dalpha = df->Amask;
    250         Uint32 amult;
    251 
    252         __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
    253 
    254         mm_zero = _mm_setzero_si64();   /* 0 -> mm_zero */
    255         /* form the alpha mult */
    256         amult = alpha | (alpha << 8);
    257         amult = amult | (amult << 16);
    258         chanmask =
    259             (0xff << df->Rshift) | (0xff << df->
    260                                     Gshift) | (0xff << df->Bshift);
    261         mm_alpha = _mm_set_pi32(0, amult & chanmask);   /* 0000AAAA -> mm_alpha, minus 1 chan */
    262         mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
    263         /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
    264         dsta = _mm_set_pi32(dalpha, dalpha);    /* dst alpha mask -> dsta */
    265 
    266         while (height--) {
    267             int n = width;
    268             if (n & 1) {
    269                 /* One Pixel Blend */
    270                 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB) */
    271                 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
    272 
    273                 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
    274                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
    275 
    276                 src2 = _mm_sub_pi16(src2, dst1);        /* src2 - dst2 -> src2 */
    277                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
    278                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
    279                 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
    280 
    281                 dst1 = _mm_packs_pu16(dst1, mm_zero);   /* 0000ARGB -> dst1 */
    282                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
    283                 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
    284 
    285                 ++srcp;
    286                 ++dstp;
    287 
    288                 n--;
    289             }
    290 
    291             for (n >>= 1; n > 0; --n) {
    292                 /* Two Pixels Blend */
    293                 src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
    294                 src2 = src1;    /* 2 x src -> src2(ARGBARGB) */
    295                 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
    296                 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
    297 
    298                 dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
    299                 dst2 = dst1;    /* 2 x dst -> dst2(ARGBARGB) */
    300                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
    301                 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
    302 
    303                 src1 = _mm_sub_pi16(src1, dst1);        /* src1 - dst1 -> src1 */
    304                 src1 = _mm_mullo_pi16(src1, mm_alpha);  /* src1 * alpha -> src1 */
    305                 src1 = _mm_srli_pi16(src1, 8);  /* src1 >> 8 -> src1 */
    306                 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
    307 
    308                 src2 = _mm_sub_pi16(src2, dst2);        /* src2 - dst2 -> src2 */
    309                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
    310                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
    311                 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
    312 
    313                 dst1 = _mm_packs_pu16(dst1, dst2);      /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
    314                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
    315 
    316                 *(__m64 *) dstp = dst1; /* dst1 -> 2 x pixel */
    317 
    318                 srcp += 2;
    319                 dstp += 2;
    320             }
    321             srcp += srcskip;
    322             dstp += dstskip;
    323         }
    324         _mm_empty();
    325     }
    326 }
    327 
/*
 * Fast ARGB888->(A)RGB888 blending with per-pixel alpha (MMX).
 *
 * For every pixel the source alpha is isolated with the source format's
 * Amask.  A zero alpha leaves the destination untouched, a full alpha
 * (== Amask) copies the source pixel verbatim, and anything else computes
 * ((src * a) >> 8) + ((dst * (255 - a)) >> 8) per 16-bit lane.  In the lane
 * selected by the source Ashift the source multiplier is forced to 0xFF
 * (see multmask) instead of a.
 */
static void
BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
{
    int width = info->dst_w;
    int height = info->dst_h;
    Uint32 *srcp = (Uint32 *) info->src;
    int srcskip = info->src_skip >> 2;
    Uint32 *dstp = (Uint32 *) info->dst;
    int dstskip = info->dst_skip >> 2;
    SDL_PixelFormat *sf = info->src_fmt;
    Uint32 amask = sf->Amask;
    Uint32 ashift = sf->Ashift;
    Uint64 multmask, multmask2;

    __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;

    mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
    /* 0xFF positioned in the 16-bit lane that holds the alpha byte after
       unpacking (each byte widens to 16 bits, hence ashift * 2) */
    multmask = 0x00FF;
    multmask <<= (ashift * 2);
    /* XOR-ing a lane that holds a with 0x00FF yields 255 - a */
    multmask2 = 0x00FF00FF00FF00FFULL;

    while (height--) {
        /* *INDENT-OFF* */
        DUFFS_LOOP4({
        Uint32 alpha = *srcp & amask;
        if (alpha == 0) {
            /* fully transparent: keep the destination pixel */
        } else if (alpha == amask) {
            /* fully opaque: plain copy */
            *dstp = *srcp;
        } else {
            src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
            src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */

            dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
            dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */

            mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
            mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
            mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
            mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
            mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);    /* 0F0A0A0A -> mm_alpha (source multipliers) */
            mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);    /* 255 - alpha in every lane -> mm_alpha2 (destination multipliers) */

            /* blend: (src * mm_alpha >> 8) + (dst * mm_alpha2 >> 8) */
            src1 = _mm_mullo_pi16(src1, mm_alpha);
            src1 = _mm_srli_pi16(src1, 8);
            dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
            dst1 = _mm_srli_pi16(dst1, 8);
            dst1 = _mm_add_pi16(src1, dst1);
            dst1 = _mm_packs_pu16(dst1, mm_zero); /* 16-bit lanes -> bytes, unsigned saturation */
            
            *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
        }
        ++srcp;
        ++dstp;
        }, width);
        /* *INDENT-ON* */
        srcp += srcskip;
        dstp += dstskip;
    }
    _mm_empty();
}
    391 
    392 #endif /* __MMX__ */
    393 
    394 #if SDL_ARM_SIMD_BLITTERS
    395 void BlitARGBto565PixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
    396 
    397 static void
    398 BlitARGBto565PixelAlphaARMSIMD(SDL_BlitInfo * info)
    399 {
    400 	int32_t width = info->dst_w;
    401 	int32_t height = info->dst_h;
    402 	uint16_t *dstp = (uint16_t *)info->dst;
    403 	int32_t dststride = width + (info->dst_skip >> 1);
    404 	uint32_t *srcp = (uint32_t *)info->src;
    405 	int32_t srcstride = width + (info->src_skip >> 2);
    406 
    407 	BlitARGBto565PixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
    408 }
    409 
    410 void BlitRGBtoRGBPixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
    411 
    412 static void
    413 BlitRGBtoRGBPixelAlphaARMSIMD(SDL_BlitInfo * info)
    414 {
    415     int32_t width = info->dst_w;
    416     int32_t height = info->dst_h;
    417     uint32_t *dstp = (uint32_t *)info->dst;
    418     int32_t dststride = width + (info->dst_skip >> 2);
    419     uint32_t *srcp = (uint32_t *)info->src;
    420     int32_t srcstride = width + (info->src_skip >> 2);
    421 
    422     BlitRGBtoRGBPixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
    423 }
    424 #endif
    425 
    426 #if SDL_ARM_NEON_BLITTERS
    427 void BlitARGBto565PixelAlphaARMNEONAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
    428 
    429 static void
    430 BlitARGBto565PixelAlphaARMNEON(SDL_BlitInfo * info)
    431 {
    432     int32_t width = info->dst_w;
    433     int32_t height = info->dst_h;
    434     uint16_t *dstp = (uint16_t *)info->dst;
    435     int32_t dststride = width + (info->dst_skip >> 1);
    436     uint32_t *srcp = (uint32_t *)info->src;
    437     int32_t srcstride = width + (info->src_skip >> 2);
    438 
    439     BlitARGBto565PixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride);
    440 }
    441 
    442 void BlitRGBtoRGBPixelAlphaARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
    443 
    444 static void
    445 BlitRGBtoRGBPixelAlphaARMNEON(SDL_BlitInfo * info)
    446 {
    447 	int32_t width = info->dst_w;
    448 	int32_t height = info->dst_h;
    449 	uint32_t *dstp = (uint32_t *)info->dst;
    450 	int32_t dststride = width + (info->dst_skip >> 2);
    451 	uint32_t *srcp = (uint32_t *)info->src;
    452 	int32_t srcstride = width + (info->src_skip >> 2);
    453 
    454 	BlitRGBtoRGBPixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride);
    455 }
    456 #endif
    457 
    458 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
    459 static void
    460 BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)
    461 {
    462     int width = info->dst_w;
    463     int height = info->dst_h;
    464     Uint32 *srcp = (Uint32 *) info->src;
    465     int srcskip = info->src_skip >> 2;
    466     Uint32 *dstp = (Uint32 *) info->dst;
    467     int dstskip = info->dst_skip >> 2;
    468 
    469     while (height--) {
    470         /* *INDENT-OFF* */
    471         DUFFS_LOOP4({
    472             Uint32 s = *srcp++;
    473             Uint32 d = *dstp;
    474             *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
    475                    + (s & d & 0x00010101)) | 0xff000000;
    476         }, width);
    477         /* *INDENT-ON* */
    478         srcp += srcskip;
    479         dstp += dstskip;
    480     }
    481 }
    482 
    483 /* fast RGB888->(A)RGB888 blending with surface alpha */
    484 static void
    485 BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info)
    486 {
    487     unsigned alpha = info->a;
    488     if (alpha == 128) {
    489         BlitRGBtoRGBSurfaceAlpha128(info);
    490     } else {
    491         int width = info->dst_w;
    492         int height = info->dst_h;
    493         Uint32 *srcp = (Uint32 *) info->src;
    494         int srcskip = info->src_skip >> 2;
    495         Uint32 *dstp = (Uint32 *) info->dst;
    496         int dstskip = info->dst_skip >> 2;
    497         Uint32 s;
    498         Uint32 d;
    499         Uint32 s1;
    500         Uint32 d1;
    501 
    502         while (height--) {
    503             /* *INDENT-OFF* */
    504             DUFFS_LOOP4({
    505                 s = *srcp;
    506                 d = *dstp;
    507                 s1 = s & 0xff00ff;
    508                 d1 = d & 0xff00ff;
    509                 d1 = (d1 + ((s1 - d1) * alpha >> 8))
    510                      & 0xff00ff;
    511                 s &= 0xff00;
    512                 d &= 0xff00;
    513                 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
    514                 *dstp = d1 | d | 0xff000000;
    515                 ++srcp;
    516                 ++dstp;
    517             }, width);
    518             /* *INDENT-ON* */
    519             srcp += srcskip;
    520             dstp += dstskip;
    521         }
    522     }
    523 }
    524 
    525 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
    526 static void
    527 BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
    528 {
    529     int width = info->dst_w;
    530     int height = info->dst_h;
    531     Uint32 *srcp = (Uint32 *) info->src;
    532     int srcskip = info->src_skip >> 2;
    533     Uint32 *dstp = (Uint32 *) info->dst;
    534     int dstskip = info->dst_skip >> 2;
    535 
    536     while (height--) {
    537         /* *INDENT-OFF* */
    538         DUFFS_LOOP4({
    539         Uint32 dalpha;
    540         Uint32 d;
    541         Uint32 s1;
    542         Uint32 d1;
    543         Uint32 s = *srcp;
    544         Uint32 alpha = s >> 24;
    545         /* FIXME: Here we special-case opaque alpha since the
    546            compositioning used (>>8 instead of /255) doesn't handle
    547            it correctly. Also special-case alpha=0 for speed?
    548            Benchmark this! */
    549         if (alpha) {
    550           if (alpha == SDL_ALPHA_OPAQUE) {
    551               *dstp = *srcp;
    552           } else {
    553             /*
    554              * take out the middle component (green), and process
    555              * the other two in parallel. One multiply less.
    556              */
    557             d = *dstp;
    558             dalpha = d >> 24;
    559             s1 = s & 0xff00ff;
    560             d1 = d & 0xff00ff;
    561             d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
    562             s &= 0xff00;
    563             d &= 0xff00;
    564             d = (d + ((s - d) * alpha >> 8)) & 0xff00;
    565             dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8);
    566             *dstp = d1 | d | (dalpha << 24);
    567           }
    568         }
    569         ++srcp;
    570         ++dstp;
    571         }, width);
    572         /* *INDENT-ON* */
    573         srcp += srcskip;
    574         dstp += dstskip;
    575     }
    576 }
    577 
    578 #ifdef __3dNOW__
/*
 * Fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha.
 *
 * For every pixel the source alpha is isolated with the source format's
 * Amask.  A zero alpha leaves the destination untouched, a full alpha
 * (== Amask) copies the source pixel verbatim, and anything else computes
 * ((src * a) >> 8) + ((dst * (255 - a)) >> 8) per 16-bit lane.  In the lane
 * selected by the source Ashift the source multiplier is forced to 0xFF
 * (see multmask) instead of a.  _m_prefetch() is issued for the addresses
 * 16 pixels ahead of the current source and destination positions.
 */
static void
BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
{
    int width = info->dst_w;
    int height = info->dst_h;
    Uint32 *srcp = (Uint32 *) info->src;
    int srcskip = info->src_skip >> 2;
    Uint32 *dstp = (Uint32 *) info->dst;
    int dstskip = info->dst_skip >> 2;
    SDL_PixelFormat *sf = info->src_fmt;
    Uint32 amask = sf->Amask;
    Uint32 ashift = sf->Ashift;
    Uint64 multmask, multmask2;

    __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;

    mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
    /* 0xFF positioned in the 16-bit lane that holds the alpha byte after
       unpacking (each byte widens to 16 bits, hence ashift * 2) */
    multmask = 0x00FF;
    multmask <<= (ashift * 2);
    /* XOR-ing a lane that holds a with 0x00FF yields 255 - a */
    multmask2 = 0x00FF00FF00FF00FFULL;

    while (height--) {
        /* *INDENT-OFF* */
        DUFFS_LOOP4({
        Uint32 alpha;

        _m_prefetch(srcp + 16);
        _m_prefetch(dstp + 16);

        alpha = *srcp & amask;
        if (alpha == 0) {
            /* fully transparent: keep the destination pixel */
        } else if (alpha == amask) {
            /* fully opaque: plain copy */
            *dstp = *srcp;
        } else {
            src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
            src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */

            dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
            dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */

            mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
            mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
            mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
            mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
            mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);    /* 0F0A0A0A -> mm_alpha (source multipliers) */
            mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);    /* 255 - alpha in every lane -> mm_alpha2 (destination multipliers) */


            /* blend: (src * mm_alpha >> 8) + (dst * mm_alpha2 >> 8) */
            src1 = _mm_mullo_pi16(src1, mm_alpha);
            src1 = _mm_srli_pi16(src1, 8);
            dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
            dst1 = _mm_srli_pi16(dst1, 8);
            dst1 = _mm_add_pi16(src1, dst1);
            dst1 = _mm_packs_pu16(dst1, mm_zero); /* 16-bit lanes -> bytes, unsigned saturation */
            
            *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
        }
        ++srcp;
        ++dstp;
        }, width);
        /* *INDENT-ON* */
        srcp += srcskip;
        dstp += dstskip;
    }
    _mm_empty();
}
    648 
    649 #endif /* __3dNOW__ */
    650 
    651 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
    652 
    653 /* blend a single 16 bit pixel at 50% */
    654 #define BLEND16_50(d, s, mask)                        \
    655     ((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff)))
    656 
    657 /* blend two 16 bit pixels at 50% */
    658 #define BLEND2x16_50(d, s, mask)                         \
    659     (((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
    660      + (s & d & (~(mask | mask << 16))))
    661 
/*
 * 16bpp -> 16bpp blit at 50% surface alpha.
 *
 * info: blit description (pointers, width/height, row skips).
 * mask: 16 bit mask handed to BLEND16_50 / BLEND2x16_50.
 *
 * Pixels are blended two at a time through 32 bit loads and stores of the
 * destination.  For every row, bit 1 of the source and destination
 * addresses is compared: when it matches, both can be read as aligned
 * 32 bit words directly; when it differs, the source is read as aligned
 * words and each pair of source pixels is reassembled from the previous
 * and the current word.
 */
static void
Blit16to16SurfaceAlpha128(SDL_BlitInfo * info, Uint16 mask)
{
    int width = info->dst_w;
    int height = info->dst_h;
    Uint16 *srcp = (Uint16 *) info->src;
    int srcskip = info->src_skip >> 1;
    Uint16 *dstp = (Uint16 *) info->dst;
    int dstskip = info->dst_skip >> 1;

    while (height--) {
        if (((uintptr_t) srcp ^ (uintptr_t) dstp) & 2) {
            /*
             * Source and destination not aligned, pipeline it.
             * This is mostly a win for big blits but no loss for
             * small ones
             */
            Uint32 prev_sw;
            int w = width;

            /* handle odd destination */
            if ((uintptr_t) dstp & 2) {
                Uint16 d = *dstp, s = *srcp;
                *dstp = BLEND16_50(d, s, mask);
                dstp++;
                srcp++;
                w--;
            }
            /* from here on srcp runs one pixel ahead of the pixel that is
               blended next; this is undone by the "- 1" at the end of the row */
            srcp++;             /* srcp is now 32-bit aligned */

            /* bootstrap pipeline with first halfword */
            prev_sw = ((Uint32 *) srcp)[-1];

            while (w > 1) {
                Uint32 sw, dw, s;
                sw = *(Uint32 *) srcp;
                dw = *(Uint32 *) dstp;
                /* combine one halfword of the previous source word with
                   one halfword of the current one */
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
                s = (prev_sw << 16) + (sw >> 16);
#else
                s = (prev_sw >> 16) + (sw << 16);
#endif
                prev_sw = sw;
                *(Uint32 *) dstp = BLEND2x16_50(dw, s, mask);
                dstp += 2;
                srcp += 2;
                w -= 2;
            }

            /* final pixel if any */
            if (w) {
                Uint16 d = *dstp, s;
                /* the pending source pixel is the not yet consumed
                   halfword of prev_sw */
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
                s = (Uint16) prev_sw;
#else
                s = (Uint16) (prev_sw >> 16);
#endif
                *dstp = BLEND16_50(d, s, mask);
                srcp++;
                dstp++;
            }
            /* - 1 compensates for the extra srcp++ done above */
            srcp += srcskip - 1;
            dstp += dstskip;
        } else {
            /* source and destination are aligned */
            int w = width;

            /* first odd pixel? */
            if ((uintptr_t) srcp & 2) {
                Uint16 d = *dstp, s = *srcp;
                *dstp = BLEND16_50(d, s, mask);
                srcp++;
                dstp++;
                w--;
            }
            /* srcp and dstp are now 32-bit aligned */

            while (w > 1) {
                Uint32 sw = *(Uint32 *) srcp;
                Uint32 dw = *(Uint32 *) dstp;
                *(Uint32 *) dstp = BLEND2x16_50(dw, sw, mask);
                srcp += 2;
                dstp += 2;
                w -= 2;
            }

            /* last odd pixel? */
            if (w) {
                Uint16 d = *dstp, s = *srcp;
                *dstp = BLEND16_50(d, s, mask);
                srcp++;
                dstp++;
            }
            srcp += srcskip;
            dstp += dstskip;
        }
    }
}
    760 
    761 #ifdef __MMX__
    762 
    763 /* fast RGB565->RGB565 blending with surface alpha */
    764 static void
    765 Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
    766 {
    767     unsigned alpha = info->a;
    768     if (alpha == 128) {
    769         Blit16to16SurfaceAlpha128(info, 0xf7de);
    770     } else {
    771         int width = info->dst_w;
    772         int height = info->dst_h;
    773         Uint16 *srcp = (Uint16 *) info->src;
    774         int srcskip = info->src_skip >> 1;
    775         Uint16 *dstp = (Uint16 *) info->dst;
    776         int dstskip = info->dst_skip >> 1;
    777         Uint32 s, d;
    778 
    779         __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
    780 
    781         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
    782         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
    783         alpha >>= 3;            /* downscale alpha to 5 bits */
    784 
    785         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
    786         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
    787         /* position alpha to allow for mullo and mulhi on diff channels
    788            to reduce the number of operations */
    789         mm_alpha = _mm_slli_si64(mm_alpha, 3);
    790 
    791         /* Setup the 565 color channel masks */
    792         gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0);   /* MASKGREEN -> gmask */
    793         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
    794 
    795         while (height--) {
    796             /* *INDENT-OFF* */
    797             DUFFS_LOOP_124(
    798             {
    799                 s = *srcp++;
    800                 d = *dstp;
    801                 /*
    802                  * shift out the middle component (green) to
    803                  * the high 16 bits, and process all three RGB
    804                  * components at the same time.
    805                  */
    806                 s = (s | s << 16) & 0x07e0f81f;
    807                 d = (d | d << 16) & 0x07e0f81f;
    808                 d += (s - d) * alpha >> 5;
    809                 d &= 0x07e0f81f;
    810                 *dstp++ = (Uint16)(d | d >> 16);
    811             },{
    812                 s = *srcp++;
    813                 d = *dstp;
    814                 /*
    815                  * shift out the middle component (green) to
    816                  * the high 16 bits, and process all three RGB
    817                  * components at the same time.
    818                  */
    819                 s = (s | s << 16) & 0x07e0f81f;
    820                 d = (d | d << 16) & 0x07e0f81f;
    821                 d += (s - d) * alpha >> 5;
    822                 d &= 0x07e0f81f;
    823                 *dstp++ = (Uint16)(d | d >> 16);
    824                 s = *srcp++;
    825                 d = *dstp;
    826                 /*
    827                  * shift out the middle component (green) to
    828                  * the high 16 bits, and process all three RGB
    829                  * components at the same time.
    830                  */
    831                 s = (s | s << 16) & 0x07e0f81f;
    832                 d = (d | d << 16) & 0x07e0f81f;
    833                 d += (s - d) * alpha >> 5;
    834                 d &= 0x07e0f81f;
    835                 *dstp++ = (Uint16)(d | d >> 16);
    836             },{
    837                 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
    838                 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
    839 
    840                 /* red */
    841                 src2 = src1;
    842                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
    843 
    844                 dst2 = dst1;
    845                 dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
    846 
    847                 /* blend */
    848                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
    849                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
    850                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
    851                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
    852                 dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
    853 
    854                 mm_res = dst2; /* RED -> mm_res */
    855 
    856                 /* green -- process the bits in place */
    857                 src2 = src1;
    858                 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
    859 
    860                 dst2 = dst1;
    861                 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
    862 
    863                 /* blend */
    864                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
    865                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
    866                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
    867                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
    868 
    869                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
    870 
    871                 /* blue */
    872                 src2 = src1;
    873                 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
    874 
    875                 dst2 = dst1;
    876                 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
    877 
    878                 /* blend */
    879                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
    880                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
    881                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
    882                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
    883                 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
    884 
    885                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
    886 
    887                 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
    888 
    889                 srcp += 4;
    890                 dstp += 4;
    891             }, width);
    892             /* *INDENT-ON* */
    893             srcp += srcskip;
    894             dstp += dstskip;
    895         }
    896         _mm_empty();
    897     }
    898 }
    899 
    900 /* fast RGB555->RGB555 blending with surface alpha */
    901 static void
    902 Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
    903 {
    904     unsigned alpha = info->a;
    905     if (alpha == 128) {
    906         Blit16to16SurfaceAlpha128(info, 0xfbde);
    907     } else {
    908         int width = info->dst_w;
    909         int height = info->dst_h;
    910         Uint16 *srcp = (Uint16 *) info->src;
    911         int srcskip = info->src_skip >> 1;
    912         Uint16 *dstp = (Uint16 *) info->dst;
    913         int dstskip = info->dst_skip >> 1;
    914         Uint32 s, d;
    915 
    916         __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
    917 
    918         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
    919         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
    920         alpha >>= 3;            /* downscale alpha to 5 bits */
    921 
    922         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
    923         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
    924         /* position alpha to allow for mullo and mulhi on diff channels
    925            to reduce the number of operations */
    926         mm_alpha = _mm_slli_si64(mm_alpha, 3);
    927 
    928         /* Setup the 555 color channel masks */
    929         rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00);   /* MASKRED -> rmask */
    930         gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0);   /* MASKGREEN -> gmask */
    931         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
    932 
    933         while (height--) {
    934             /* *INDENT-OFF* */
    935             DUFFS_LOOP_124(
    936             {
    937                 s = *srcp++;
    938                 d = *dstp;
    939                 /*
    940                  * shift out the middle component (green) to
    941                  * the high 16 bits, and process all three RGB
    942                  * components at the same time.
    943                  */
    944                 s = (s | s << 16) & 0x03e07c1f;
    945                 d = (d | d << 16) & 0x03e07c1f;
    946                 d += (s - d) * alpha >> 5;
    947                 d &= 0x03e07c1f;
    948                 *dstp++ = (Uint16)(d | d >> 16);
    949             },{
    950                 s = *srcp++;
    951                 d = *dstp;
    952                 /*
    953                  * shift out the middle component (green) to
    954                  * the high 16 bits, and process all three RGB
    955                  * components at the same time.
    956                  */
    957                 s = (s | s << 16) & 0x03e07c1f;
    958                 d = (d | d << 16) & 0x03e07c1f;
    959                 d += (s - d) * alpha >> 5;
    960                 d &= 0x03e07c1f;
    961                 *dstp++ = (Uint16)(d | d >> 16);
    962                     s = *srcp++;
    963                 d = *dstp;
    964                 /*
    965                  * shift out the middle component (green) to
    966                  * the high 16 bits, and process all three RGB
    967                  * components at the same time.
    968                  */
    969                 s = (s | s << 16) & 0x03e07c1f;
    970                 d = (d | d << 16) & 0x03e07c1f;
    971                 d += (s - d) * alpha >> 5;
    972                 d &= 0x03e07c1f;
    973                 *dstp++ = (Uint16)(d | d >> 16);
    974             },{
    975                 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
    976                 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
    977 
    978                 /* red -- process the bits in place */
    979                 src2 = src1;
    980                 src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
    981 
    982                 dst2 = dst1;
    983                 dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
    984 
    985                 /* blend */
    986                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
    987                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
    988                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
    989                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
    990                 dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
    991 
    992                 mm_res = dst2; /* RED -> mm_res */
    993                 
    994                 /* green -- process the bits in place */
    995                 src2 = src1;
    996                 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
    997 
    998                 dst2 = dst1;
    999                 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
   1000 
   1001                 /* blend */
   1002                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   1003                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   1004                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
   1005                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   1006 
   1007                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
   1008 
   1009                 /* blue */
   1010                 src2 = src1; /* src -> src2 */
   1011                 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
   1012 
   1013                 dst2 = dst1; /* dst -> dst2 */
   1014                 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
   1015 
   1016                 /* blend */
   1017                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   1018                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   1019                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
   1020                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   1021                 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
   1022 
   1023                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
   1024 
   1025                 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
   1026 
   1027                 srcp += 4;
   1028                 dstp += 4;
   1029             }, width);
   1030             /* *INDENT-ON* */
   1031             srcp += srcskip;
   1032             dstp += dstskip;
   1033         }
   1034         _mm_empty();
   1035     }
   1036 }
   1037 
   1038 #endif /* __MMX__ */
   1039 
   1040 /* fast RGB565->RGB565 blending with surface alpha */
   1041 static void
   1042 Blit565to565SurfaceAlpha(SDL_BlitInfo * info)
   1043 {
   1044     unsigned alpha = info->a;
   1045     if (alpha == 128) {
   1046         Blit16to16SurfaceAlpha128(info, 0xf7de);
   1047     } else {
   1048         int width = info->dst_w;
   1049         int height = info->dst_h;
   1050         Uint16 *srcp = (Uint16 *) info->src;
   1051         int srcskip = info->src_skip >> 1;
   1052         Uint16 *dstp = (Uint16 *) info->dst;
   1053         int dstskip = info->dst_skip >> 1;
   1054         alpha >>= 3;            /* downscale alpha to 5 bits */
   1055 
   1056         while (height--) {
   1057             /* *INDENT-OFF* */
   1058             DUFFS_LOOP4({
   1059                 Uint32 s = *srcp++;
   1060                 Uint32 d = *dstp;
   1061                 /*
   1062                  * shift out the middle component (green) to
   1063                  * the high 16 bits, and process all three RGB
   1064                  * components at the same time.
   1065                  */
   1066                 s = (s | s << 16) & 0x07e0f81f;
   1067                 d = (d | d << 16) & 0x07e0f81f;
   1068                 d += (s - d) * alpha >> 5;
   1069                 d &= 0x07e0f81f;
   1070                 *dstp++ = (Uint16)(d | d >> 16);
   1071             }, width);
   1072             /* *INDENT-ON* */
   1073             srcp += srcskip;
   1074             dstp += dstskip;
   1075         }
   1076     }
   1077 }
   1078 
   1079 /* fast RGB555->RGB555 blending with surface alpha */
   1080 static void
   1081 Blit555to555SurfaceAlpha(SDL_BlitInfo * info)
   1082 {
   1083     unsigned alpha = info->a;   /* downscale alpha to 5 bits */
   1084     if (alpha == 128) {
   1085         Blit16to16SurfaceAlpha128(info, 0xfbde);
   1086     } else {
   1087         int width = info->dst_w;
   1088         int height = info->dst_h;
   1089         Uint16 *srcp = (Uint16 *) info->src;
   1090         int srcskip = info->src_skip >> 1;
   1091         Uint16 *dstp = (Uint16 *) info->dst;
   1092         int dstskip = info->dst_skip >> 1;
   1093         alpha >>= 3;            /* downscale alpha to 5 bits */
   1094 
   1095         while (height--) {
   1096             /* *INDENT-OFF* */
   1097             DUFFS_LOOP4({
   1098                 Uint32 s = *srcp++;
   1099                 Uint32 d = *dstp;
   1100                 /*
   1101                  * shift out the middle component (green) to
   1102                  * the high 16 bits, and process all three RGB
   1103                  * components at the same time.
   1104                  */
   1105                 s = (s | s << 16) & 0x03e07c1f;
   1106                 d = (d | d << 16) & 0x03e07c1f;
   1107                 d += (s - d) * alpha >> 5;
   1108                 d &= 0x03e07c1f;
   1109                 *dstp++ = (Uint16)(d | d >> 16);
   1110             }, width);
   1111             /* *INDENT-ON* */
   1112             srcp += srcskip;
   1113             dstp += dstskip;
   1114         }
   1115     }
   1116 }
   1117 
   1118 /* fast ARGB8888->RGB565 blending with pixel alpha */
   1119 static void
   1120 BlitARGBto565PixelAlpha(SDL_BlitInfo * info)
   1121 {
   1122     int width = info->dst_w;
   1123     int height = info->dst_h;
   1124     Uint32 *srcp = (Uint32 *) info->src;
   1125     int srcskip = info->src_skip >> 2;
   1126     Uint16 *dstp = (Uint16 *) info->dst;
   1127     int dstskip = info->dst_skip >> 1;
   1128 
   1129     while (height--) {
   1130         /* *INDENT-OFF* */
   1131         DUFFS_LOOP4({
   1132         Uint32 s = *srcp;
   1133         unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
   1134         /* FIXME: Here we special-case opaque alpha since the
   1135            compositioning used (>>8 instead of /255) doesn't handle
   1136            it correctly. Also special-case alpha=0 for speed?
   1137            Benchmark this! */
   1138         if(alpha) {   
   1139           if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
   1140             *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
   1141           } else {
   1142             Uint32 d = *dstp;
   1143             /*
   1144              * convert source and destination to G0RAB65565
   1145              * and blend all components at the same time
   1146              */
   1147             s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
   1148               + (s >> 3 & 0x1f);
   1149             d = (d | d << 16) & 0x07e0f81f;
   1150             d += (s - d) * alpha >> 5;
   1151             d &= 0x07e0f81f;
   1152             *dstp = (Uint16)(d | d >> 16);
   1153           }
   1154         }
   1155         srcp++;
   1156         dstp++;
   1157         }, width);
   1158         /* *INDENT-ON* */
   1159         srcp += srcskip;
   1160         dstp += dstskip;
   1161     }
   1162 }
   1163 
   1164 /* fast ARGB8888->RGB555 blending with pixel alpha */
   1165 static void
   1166 BlitARGBto555PixelAlpha(SDL_BlitInfo * info)
   1167 {
   1168     int width = info->dst_w;
   1169     int height = info->dst_h;
   1170     Uint32 *srcp = (Uint32 *) info->src;
   1171     int srcskip = info->src_skip >> 2;
   1172     Uint16 *dstp = (Uint16 *) info->dst;
   1173     int dstskip = info->dst_skip >> 1;
   1174 
   1175     while (height--) {
   1176         /* *INDENT-OFF* */
   1177         DUFFS_LOOP4({
   1178         unsigned alpha;
   1179         Uint32 s = *srcp;
   1180         alpha = s >> 27; /* downscale alpha to 5 bits */
   1181         /* FIXME: Here we special-case opaque alpha since the
   1182            compositioning used (>>8 instead of /255) doesn't handle
   1183            it correctly. Also special-case alpha=0 for speed?
   1184            Benchmark this! */
   1185         if(alpha) {   
   1186           if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
   1187             *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
   1188           } else {
   1189             Uint32 d = *dstp;
   1190             /*
   1191              * convert source and destination to G0RAB65565
   1192              * and blend all components at the same time
   1193              */
   1194             s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
   1195               + (s >> 3 & 0x1f);
   1196             d = (d | d << 16) & 0x03e07c1f;
   1197             d += (s - d) * alpha >> 5;
   1198             d &= 0x03e07c1f;
   1199             *dstp = (Uint16)(d | d >> 16);
   1200           }
   1201         }
   1202         srcp++;
   1203         dstp++;
   1204         }, width);
   1205         /* *INDENT-ON* */
   1206         srcp += srcskip;
   1207         dstp += dstskip;
   1208     }
   1209 }
   1210 
   1211 /* General (slow) N->N blending with per-surface alpha */
   1212 static void
   1213 BlitNtoNSurfaceAlpha(SDL_BlitInfo * info)
   1214 {
   1215     int width = info->dst_w;
   1216     int height = info->dst_h;
   1217     Uint8 *src = info->src;
   1218     int srcskip = info->src_skip;
   1219     Uint8 *dst = info->dst;
   1220     int dstskip = info->dst_skip;
   1221     SDL_PixelFormat *srcfmt = info->src_fmt;
   1222     SDL_PixelFormat *dstfmt = info->dst_fmt;
   1223     int srcbpp = srcfmt->BytesPerPixel;
   1224     int dstbpp = dstfmt->BytesPerPixel;
   1225     Uint32 Pixel;
   1226     unsigned sR, sG, sB;
   1227     unsigned dR, dG, dB, dA;
   1228     const unsigned sA = info->a;
   1229 
   1230     if (sA) {
   1231         while (height--) {
   1232         /* *INDENT-OFF* */
   1233         DUFFS_LOOP4(
   1234         {
   1235         DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
   1236         DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
   1237         ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
   1238         ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
   1239         src += srcbpp;
   1240         dst += dstbpp;
   1241         },
   1242         width);
   1243         /* *INDENT-ON* */
   1244             src += srcskip;
   1245             dst += dstskip;
   1246         }
   1247     }
   1248 }
   1249 
   1250 /* General (slow) colorkeyed N->N blending with per-surface alpha */
   1251 static void
   1252 BlitNtoNSurfaceAlphaKey(SDL_BlitInfo * info)
   1253 {
   1254     int width = info->dst_w;
   1255     int height = info->dst_h;
   1256     Uint8 *src = info->src;
   1257     int srcskip = info->src_skip;
   1258     Uint8 *dst = info->dst;
   1259     int dstskip = info->dst_skip;
   1260     SDL_PixelFormat *srcfmt = info->src_fmt;
   1261     SDL_PixelFormat *dstfmt = info->dst_fmt;
   1262     Uint32 ckey = info->colorkey;
   1263     int srcbpp = srcfmt->BytesPerPixel;
   1264     int dstbpp = dstfmt->BytesPerPixel;
   1265     Uint32 Pixel;
   1266     unsigned sR, sG, sB;
   1267     unsigned dR, dG, dB, dA;
   1268     const unsigned sA = info->a;
   1269 
   1270     while (height--) {
   1271         /* *INDENT-OFF* */
   1272         DUFFS_LOOP4(
   1273         {
   1274         RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
   1275         if(sA && Pixel != ckey) {
   1276             RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
   1277             DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
   1278             ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
   1279             ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
   1280         }
   1281         src += srcbpp;
   1282         dst += dstbpp;
   1283         },
   1284         width);
   1285         /* *INDENT-ON* */
   1286         src += srcskip;
   1287         dst += dstskip;
   1288     }
   1289 }
   1290 
   1291 /* General (slow) N->N blending with pixel alpha */
   1292 static void
   1293 BlitNtoNPixelAlpha(SDL_BlitInfo * info)
   1294 {
   1295     int width = info->dst_w;
   1296     int height = info->dst_h;
   1297     Uint8 *src = info->src;
   1298     int srcskip = info->src_skip;
   1299     Uint8 *dst = info->dst;
   1300     int dstskip = info->dst_skip;
   1301     SDL_PixelFormat *srcfmt = info->src_fmt;
   1302     SDL_PixelFormat *dstfmt = info->dst_fmt;
   1303     int srcbpp;
   1304     int dstbpp;
   1305     Uint32 Pixel;
   1306     unsigned sR, sG, sB, sA;
   1307     unsigned dR, dG, dB, dA;
   1308 
   1309     /* Set up some basic variables */
   1310     srcbpp = srcfmt->BytesPerPixel;
   1311     dstbpp = dstfmt->BytesPerPixel;
   1312 
   1313     while (height--) {
   1314         /* *INDENT-OFF* */
   1315         DUFFS_LOOP4(
   1316         {
   1317         DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
   1318         if(sA) {
   1319             DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
   1320             ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
   1321             ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
   1322         }
   1323         src += srcbpp;
   1324         dst += dstbpp;
   1325         },
   1326         width);
   1327         /* *INDENT-ON* */
   1328         src += srcskip;
   1329         dst += dstskip;
   1330     }
   1331 }
   1332 
   1333 
   1334 SDL_BlitFunc
   1335 SDL_CalculateBlitA(SDL_Surface * surface)
   1336 {
   1337     SDL_PixelFormat *sf = surface->format;
   1338     SDL_PixelFormat *df = surface->map->dst->format;
   1339 
   1340     switch (surface->map->info.flags & ~SDL_COPY_RLE_MASK) {
   1341     case SDL_COPY_BLEND:
   1342         /* Per-pixel alpha blits */
   1343         switch (df->BytesPerPixel) {
   1344         case 1:
   1345             if (df->palette != NULL) {
   1346                 return BlitNto1PixelAlpha;
   1347             } else {
   1348                 /* RGB332 has no palette ! */
   1349                 return BlitNtoNPixelAlpha;
   1350             }
   1351 
   1352         case 2:
   1353 #if SDL_ARM_NEON_BLITTERS || SDL_ARM_SIMD_BLITTERS
   1354                 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
   1355                     && sf->Gmask == 0xff00 && df->Gmask == 0x7e0
   1356                     && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
   1357                     || (sf->Bmask == 0xff && df->Bmask == 0x1f)))
   1358                 {
   1359 #if SDL_ARM_NEON_BLITTERS
   1360                     if (SDL_HasNEON())
   1361                         return BlitARGBto565PixelAlphaARMNEON;
   1362 #endif
   1363 #if SDL_ARM_SIMD_BLITTERS
   1364                     if (SDL_HasARMSIMD())
   1365                         return BlitARGBto565PixelAlphaARMSIMD;
   1366 #endif
   1367                 }
   1368 #endif
   1369                 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
   1370                     && sf->Gmask == 0xff00
   1371                     && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
   1372                         || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
   1373                 if (df->Gmask == 0x7e0)
   1374                     return BlitARGBto565PixelAlpha;
   1375                 else if (df->Gmask == 0x3e0)
   1376                     return BlitARGBto555PixelAlpha;
   1377             }
   1378             return BlitNtoNPixelAlpha;
   1379 
   1380         case 4:
   1381             if (sf->Rmask == df->Rmask
   1382                 && sf->Gmask == df->Gmask
   1383                 && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
   1384 #if defined(__MMX__) || defined(__3dNOW__)
   1385                 if (sf->Rshift % 8 == 0
   1386                     && sf->Gshift % 8 == 0
   1387                     && sf->Bshift % 8 == 0
   1388                     && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
   1389 #ifdef __3dNOW__
   1390                     if (SDL_Has3DNow())
   1391                         return BlitRGBtoRGBPixelAlphaMMX3DNOW;
   1392 #endif
   1393 #ifdef __MMX__
   1394                     if (SDL_HasMMX())
   1395                         return BlitRGBtoRGBPixelAlphaMMX;
   1396 #endif
   1397                 }
   1398 #endif /* __MMX__ || __3dNOW__ */
   1399                 if (sf->Amask == 0xff000000) {
   1400 #if SDL_ARM_NEON_BLITTERS
   1401                     if (SDL_HasNEON())
   1402                         return BlitRGBtoRGBPixelAlphaARMNEON;
   1403 #endif
   1404 #if SDL_ARM_SIMD_BLITTERS
   1405                     if (SDL_HasARMSIMD())
   1406                         return BlitRGBtoRGBPixelAlphaARMSIMD;
   1407 #endif
   1408                     return BlitRGBtoRGBPixelAlpha;
   1409                 }
   1410             }
   1411             return BlitNtoNPixelAlpha;
   1412 
   1413         case 3:
   1414         default:
   1415             break;
   1416         }
   1417         return BlitNtoNPixelAlpha;
   1418 
   1419     case SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
   1420         if (sf->Amask == 0) {
   1421             /* Per-surface alpha blits */
   1422             switch (df->BytesPerPixel) {
   1423             case 1:
   1424                 if (df->palette != NULL) {
   1425                     return BlitNto1SurfaceAlpha;
   1426                 } else {
   1427                     /* RGB332 has no palette ! */
   1428                     return BlitNtoNSurfaceAlpha;
   1429                 }
   1430 
   1431             case 2:
   1432                 if (surface->map->identity) {
   1433                     if (df->Gmask == 0x7e0) {
   1434 #ifdef __MMX__
   1435                         if (SDL_HasMMX())
   1436                             return Blit565to565SurfaceAlphaMMX;
   1437                         else
   1438 #endif
   1439                             return Blit565to565SurfaceAlpha;
   1440                     } else if (df->Gmask == 0x3e0) {
   1441 #ifdef __MMX__
   1442                         if (SDL_HasMMX())
   1443                             return Blit555to555SurfaceAlphaMMX;
   1444                         else
   1445 #endif
   1446                             return Blit555to555SurfaceAlpha;
   1447                     }
   1448                 }
   1449                 return BlitNtoNSurfaceAlpha;
   1450 
   1451             case 4:
   1452                 if (sf->Rmask == df->Rmask
   1453                     && sf->Gmask == df->Gmask
   1454                     && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
   1455 #ifdef __MMX__
   1456                     if (sf->Rshift % 8 == 0
   1457                         && sf->Gshift % 8 == 0
   1458                         && sf->Bshift % 8 == 0 && SDL_HasMMX())
   1459                         return BlitRGBtoRGBSurfaceAlphaMMX;
   1460 #endif
   1461                     if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
   1462                         return BlitRGBtoRGBSurfaceAlpha;
   1463                     }
   1464                 }
   1465                 return BlitNtoNSurfaceAlpha;
   1466 
   1467             case 3:
   1468             default:
   1469                 return BlitNtoNSurfaceAlpha;
   1470             }
   1471         }
   1472         break;
   1473 
   1474     case SDL_COPY_COLORKEY | SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
   1475         if (sf->Amask == 0) {
   1476             if (df->BytesPerPixel == 1) {
   1477 
   1478                 if (df->palette != NULL) {
   1479                     return BlitNto1SurfaceAlphaKey;
   1480                 } else {
   1481                     /* RGB332 has no palette ! */
   1482                     return BlitNtoNSurfaceAlphaKey;
   1483                 }
   1484             } else {
   1485                 return BlitNtoNSurfaceAlphaKey;
   1486             }
   1487         }
   1488         break;
   1489     }
   1490 
   1491     return NULL;
   1492 }
   1493 
   1494 #endif /* SDL_HAVE_BLIT_A */
   1495 
   1496 /* vi: set ts=4 sw=4 expandtab: */
	sdl FORK: Simple Directmedia Layer
	git clone https://git.neptards.moe/neptards/sdl.git
	Log \| Files \| Refs