sdl

FORK: Simple Directmedia Layer
git clone https://git.neptards.moe/neptards/sdl.git
Log | Files | Refs

pixman-arm-simd-asm.S (19392B)


      1 /*
      2  * Copyright (c) 2016 RISC OS Open Ltd
      3  *
      4  * This software is provided 'as-is', without any express or implied
      5  * warranty.  In no event will the authors be held liable for any damages
      6  * arising from the use of this software.
      7  *
      8  * Permission is granted to anyone to use this software for any purpose,
      9  * including commercial applications, and to alter it and redistribute it
     10  * freely, subject to the following restrictions:
     11  *
     12  * 1. The origin of this software must not be misrepresented; you must not
     13  *    claim that you wrote the original software. If you use this software
     14  *    in a product, an acknowledgment in the product documentation would be
     15  *    appreciated but is not required.
     16  * 2. Altered source versions must be plainly marked as such, and must not be
     17  *    misrepresented as being the original software.
     18  * 3. This notice may not be removed or altered from any source distribution.
     19  */
     20 
     21 /* Prevent the stack from becoming executable */
     22 #if defined(__linux__) && defined(__ELF__)   /* only emitted for Linux ELF builds */
     23 .section .note.GNU-stack,"",%progbits        /* empty flags string: the marker section carries no executable flag */
     24 #endif
     25 
     26 	.text                 /* everything below goes into the code section */
     27 	.arch armv6           /* accept ARMv6 instructions (uxtb16, pkhbt, sel, uadd8, ... are used below) */
     28 	.object_arch armv4    /* but record ARMv4 as the architecture of the object file */
     29 	.arm                  /* ARM (not Thumb) instruction encoding */
     30 	.altmacro             /* alternate macro mode: the macros below use %(expr) arguments and bare parameter names */
     31 	.p2align 2            /* align to 2^2 = 4 bytes */
     32 
     33 #include "pixman-arm-asm.h"
     34 #include "pixman-arm-simd-asm.h"
     35 
     36 /* A head macro should do all processing which results in an output of up to
     37  * 16 bytes, as far as the final load instruction. The corresponding tail macro
     38  * should complete the processing of the up-to-16 bytes. The calling macro will
     39  * sometimes choose to insert a preload or a decrement of X between them.
     40  *   cond           ARM condition code for code block
     41  *   numbytes       Number of output bytes that should be generated this time
     42  *   firstreg       First WK register in which to place output
     43  *   unaligned_src  Whether to use non-wordaligned loads of source image
     44  *   unaligned_mask Whether to use non-wordaligned loads of mask image
     45  *   preload        If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
     46  */
     47 
     48 /******************************************************************************/
     49 
     50 .macro FillRect32_init                          /* load the 32-bit fill value and copy it into four registers */
     51         ldr     SRC, [sp, #ARGS_STACK_OFFSET]   /* fill value is read from the stack argument slot */
     52         mov     STRIDE_M, SRC                   /* the three copies are independent of each other */
     53         mov     MASK, SRC
     54         mov     STRIDE_S, SRC
     55 .endm
     56 
     57 .macro FillRect16_init                          /* build a word holding two copies of the 16-bit fill value */
     58         ldrh    SRC, [sp, #ARGS_STACK_OFFSET]   /* zero-extended halfword from the stack argument slot */
     59         orr     SRC, SRC, SRC, lsl #16          /* duplicate it into the upper halfword */
     60         mov     STRIDE_M, SRC                   /* then copy the word into the other three registers */
     61         mov     MASK, SRC
     62         mov     STRIDE_S, SRC
     63 .endm
     64 
     65 .macro FillRect8_init                           /* build a word holding four copies of the 8-bit fill value */
     66         ldrb    SRC, [sp, #ARGS_STACK_OFFSET]   /* zero-extended byte from the stack argument slot */
     67         orr     SRC, SRC, SRC, lsl #8           /* two copies in the low halfword */
     68         orr     SRC, SRC, SRC, lsl #16          /* four copies across the word */
     69         mov     STRIDE_M, SRC                   /* then copy the word into the other three registers */
     70         mov     MASK, SRC
     71         mov     STRIDE_S, SRC
     72 .endm
     73 
     74 .macro FillRect_process_tail  cond, numbytes, firstreg   /* store numbytes of the fill pattern to DST; firstreg is not referenced */
     75     WK4     .req    SRC          /* give the four registers filled by the FillRect*_init */
     76     WK5     .req    STRIDE_S     /* macros the temporary names WK4-WK7 */
     77     WK6     .req    MASK
     78     WK7     .req    STRIDE_M
     79         pixst   cond, numbytes, 4, DST   /* register index 4 is passed instead of firstreg; pixst is defined outside this file, presumably it stores from WK4 onwards */
     80     .unreq  WK4                  /* remove the temporary names again */
     81     .unreq  WK5
     82     .unreq  WK6
     83     .unreq  WK7
     84 .endm
     85 
     86 generate_composite_function /* instantiate the 32bpp solid fill routine */ \
     87     FillRect32ARMSIMDAsm, 0, 0, 32, /* presumably source, mask and destination bits per pixel */ \
     88     FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
     89     0, /* prefetch distance doesn't apply */ \
     90     FillRect32_init, /* init: replicates the fill value into four registers */ \
     91     nop_macro, /* newline */ \
     92     nop_macro, /* cleanup */ \
     93     nop_macro, /* process head */ \
     94     FillRect_process_tail /* process tail: performs the stores */
     95 
     96 generate_composite_function /* instantiate the 16bpp solid fill routine */ \
     97     FillRect16ARMSIMDAsm, 0, 0, 16, /* presumably source, mask and destination bits per pixel */ \
     98     FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
     99     0, /* prefetch distance doesn't apply */ \
    100     FillRect16_init, /* init: replicates the fill value into four registers */ \
    101     nop_macro, /* newline */ \
    102     nop_macro, /* cleanup */ \
    103     nop_macro, /* process head */ \
    104     FillRect_process_tail /* process tail: performs the stores */
    105 
    106 generate_composite_function /* instantiate the 8bpp solid fill routine */ \
    107     FillRect8ARMSIMDAsm, 0, 0, 8, /* presumably source, mask and destination bits per pixel */ \
    108     FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    109     0, /* prefetch distance doesn't apply */ \
    110     FillRect8_init, /* init: replicates the fill value into four registers */ \
    111     nop_macro, /* newline */ \
    112     nop_macro, /* cleanup */ \
    113     nop_macro, /* process head */ \
    114     FillRect_process_tail /* process tail: performs the stores */
    115 
    116 /******************************************************************************/
    117 
    118 /* This differs from the over_8888_8888 routine in Pixman in that the destination
    119  * alpha component is always left unchanged, and RGB components are not
    120  * premultiplied by alpha. It differs from BlitRGBtoRGBPixelAlpha in that
    121  * renormalisation is done by multiplying by 257/256 (with rounding) rather than
    122  * simply shifting right by 8 bits - removing the need to special-case alpha=0xff.
    123  */
    124 
    125 .macro RGBtoRGBPixelAlpha_init
    126         line_saved_regs STRIDE_S, ORIG_W   /* both registers are overwritten by the process macros below; line_saved_regs is defined outside this file */
    127         mov     MASK, #0x80                /* rounding constant, passed as "half" to the translucent blend */
    128 .endm
    129 
    130 .macro RGBtoRGBPixelAlpha_1pixel_translucent  s, d, tmp0, tmp1, tmp2, tmp3, half   /* blend bytes 0-2 of s into d, weighted by the top byte of s; s and tmp0-tmp3 are overwritten */
    131         uxtb    tmp3, s                     /* source byte 0 */
    132         uxtb    tmp0, d                     /* destination byte 0 */
    133         sub     tmp0, tmp3, tmp0            /* tmp0 = byte 0 difference (source - destination) */
    134         uxtb    tmp3, s, ror #16            /* source byte 2 */
    135         uxtb    tmp1, d, ror #16            /* destination byte 2 */
    136         sub     tmp1, tmp3, tmp1            /* tmp1 = byte 2 difference */
    137         uxtb    tmp3, s, ror #8             /* source byte 1 */
    138         mov     s, s, lsr #24               /* from here on s holds only the source alpha */
    139         uxtb    tmp2, d, ror #8             /* destination byte 1 */
    140         sub     tmp2, tmp3, tmp2            /* tmp2 = byte 1 difference */
    141         smlabb  tmp0, tmp0, s, half         /* difference * alpha + half, for each of the three bytes */
    142         smlabb  tmp1, tmp1, s, half
    143         smlabb  tmp2, tmp2, s, half
    144         add     tmp0, tmp0, asr #8          /* x += x >> 8: the multiply by 257/256 renormalisation */
    145         add     tmp1, tmp1, asr #8
    146         add     tmp2, tmp2, asr #8
    147         pkhbt   tmp0, tmp0, tmp1, lsl #16   /* byte 0 result in the low halfword, byte 2 result in the high halfword */
    148         and     tmp2, tmp2, #0xff00         /* bits 8-15 of the byte 1 result are already in their final position */
    149         uxtb16  tmp0, tmp0, ror #8          /* bits 8-15 of each halfword move down into bytes 0 and 2 */
    150         orr     tmp0, tmp0, tmp2            /* tmp0 = per-byte deltas for bytes 0-2, top byte zero */
    151         uadd8   d, d, tmp0                  /* bytewise add: the top (alpha) byte of d is left unchanged */
    152 .endm
    153 
    154 .macro RGBtoRGBPixelAlpha_1pixel_opaque  s, d   /* d = top byte of d combined with the low three bytes of s; s loses its top byte */
    155         bic     s, s, #0xff000000               /* drop the source alpha byte */
    156         and     d, d, #0xff000000               /* keep only the destination alpha byte */
    157         orr     d, d, s                         /* merge the two */
    158 .endm
    159 
    160 .macro RGBtoRGBPixelAlpha_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload   /* load pixels and set Z if every source alpha byte is zero */
    161  .if numbytes == 16
    162         ldm     SRC!, {WK0, WK1}             /* source pixels 0 and 1 */
    163         ldm     SRC!, {STRIDE_S, STRIDE_M}   /* source pixels 2 and 3, needed here only for the alpha tests */
    164         ldrd    WK2, WK3, [DST], #16         /* destination pixels 0 and 1; DST advances past all four */
    165         orr     SCRATCH, WK0, WK1            /* SCRATCH accumulates the OR of the four source pixels */
    166         and     ORIG_W, WK0, WK1             /* ORIG_W accumulates the AND of the four source pixels */
    167         orr     SCRATCH, SCRATCH, STRIDE_S
    168         and     ORIG_W, ORIG_W, STRIDE_S
    169         orr     SCRATCH, SCRATCH, STRIDE_M
    170         and     ORIG_W, ORIG_W, STRIDE_M
    171         tst     SCRATCH, #0xff000000         /* Z set when all four alpha bytes are zero */
    172  .elseif numbytes == 8
    173         ldm     SRC!, {WK0, WK1}             /* two source pixels */
    174         ldm     DST!, {WK2, WK3}             /* two destination pixels */
    175         orr     SCRATCH, WK0, WK1            /* OR of the sources */
    176         and     ORIG_W, WK0, WK1             /* AND of the sources */
    177         tst     SCRATCH, #0xff000000         /* Z set when both alpha bytes are zero */
    178  .else // numbytes == 4
    179         ldr     WK0, [SRC], #4               /* one source pixel */
    180         ldr     WK2, [DST], #4               /* one destination pixel */
    181         tst     WK0, #0xff000000             /* Z set when its alpha byte is zero */
    182  .endif
    183 .endm
    184 
    185 .macro RGBtoRGBPixelAlpha_process_tail  cond, numbytes, firstreg   /* blend and store the pixels loaded by process_head; labels: 10 = opaque path, 19 = final store, 20 = exit */
    186         beq     20f @ all transparent
    187  .if numbytes == 16
    188         cmp     ORIG_W, #0xff000000          /* ORIG_W is the AND of the sources: unsigned >= means every alpha byte is 0xff */
    189         bhs     10f @ all opaque
    190         RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK   /* pixels 0 and 1; uses STRIDE_S/STRIDE_M as temporaries */
    191         RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
    192         strd    WK2, WK3, [DST, #-16]        /* DST was already advanced by 16 in process_head */
    193         ldrd    WK0, WK1, [SRC, #-8]         /* reload source pixels 2 and 3 from memory */
    194         ldrd    WK2, WK3, [DST, #-8]         /* destination pixels 2 and 3 */
    195         RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
    196         RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
    197         b       19f
    198 10:     RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
    199         RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3
    200         strd    WK2, WK3, [DST, #-16]        /* store pixels 0 and 1 */
    201         ldrd    WK0, WK1, [SRC, #-8]         /* source pixels 2 and 3 */
    202         ldrd    WK2, WK3, [DST, #-8]         /* destination pixels 2 and 3 */
    203         RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
    204         RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3
    205 19:     strd    WK2, WK3, [DST, #-8]         /* both paths store pixels 2 and 3 here */
    206  .elseif numbytes == 8
    207         cmp     ORIG_W, #0xff000000          /* same all-opaque test on the AND of the two sources */
    208         bhs     10f @ all opaque
    209         RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
    210         RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
    211         b       19f
    212 10:     RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
    213         RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3
    214 19:     strd    WK2, WK3, [DST, #-8]         /* DST was already advanced by 8 in process_head */
    215  .else // numbytes == 4
    216         cmp     WK0, #0xff000000             /* single pixel: test its own alpha byte */
    217         bhs     10f @ opaque
    218         RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
    219         b       19f
    220 10:     RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
    221 19:     str     WK2, [DST, #-4]              /* DST was already advanced by 4 in process_head */
    222  .endif
    223 20:
    224 .endm
    225 
    226 generate_composite_function /* instantiate the 32bpp per-pixel-alpha blit */ \
    227     BlitRGBtoRGBPixelAlphaARMSIMDAsm, 32, 0, 32, /* presumably source, mask and destination bits per pixel */ \
    228     FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
    229     2, /* prefetch distance */ \
    230     RGBtoRGBPixelAlpha_init, /* init */ \
    231     nop_macro, /* newline */ \
    232     nop_macro, /* cleanup */ \
    233     RGBtoRGBPixelAlpha_process_head, /* process head: loads and alpha tests */ \
    234     RGBtoRGBPixelAlpha_process_tail /* process tail: blends and stores */
    235 
    236 /******************************************************************************/
    237 
    238 .macro ARGBto565PixelAlpha_init
    239         line_saved_regs STRIDE_D, STRIDE_S, ORIG_W   /* all three are used as working registers by the process macros; line_saved_regs is defined outside this file */
    240         mov     MASK, #0x001f                        /* MASK becomes 0x001f001f, passed as rbmask */
    241         mov     STRIDE_M, #0x0010                    /* STRIDE_M becomes 0x00100010, passed as rbhalf */
    242         orr     MASK, MASK, MASK, lsl #16
    243         orr     STRIDE_M, STRIDE_M, STRIDE_M, lsl #16
    244 .endm
    245 
    246 .macro ARGBto565PixelAlpha_newline
    247         mov     STRIDE_S, #0x0200   /* passed as ghalf to the translucent blends; NOTE(review): presumably reloaded per line because STRIDE_S is listed in line_saved_regs - confirm */
    248 .endm
    249 
    250 /* On entry:
    251  * s holds 1 32bpp source pixel
    252  * d holds 1 16bpp destination pixel
    253  * rbmask, rbhalf, ghalf hold 0x001f001f, 0x00100010, 0x00000200 respectively
    254  * other registers are temporaries
    255  * On exit:
    256  * Constant registers preserved
    257  */
    258 
    259 .macro ARGBto565PixelAlpha_1pixel_translucent  s, d, rbmask, rbhalf, ghalf, alpha, rb, g, misc   /* blend s into the 565 pixel d; result is in the low halfword of d */
    260         mov     alpha, s, lsr #27           /* top 5 bits of the source alpha */
    261         and     misc, s, #0xfc00            /* top 6 bits of source byte 1 (bits 10-15) */
    262         and     g, d, #0x07e0               /* destination bits 5-10 */
    263         pkhbt   rb, d, d, lsl #5            /* low halfword = d, high halfword = d << 5, so d bits 11-15 land in bits 16-20 */
    264         rsb     misc, g, misc, lsr #5       /* misc = (source field >> 5) - destination field, both at bits 5-10 */
    265         and     s, rbmask, s, lsr #3        /* top 5 bits of source bytes 0 and 2, at bits 0-4 and 16-20 */
    266         and     rb, rbmask, rb              /* destination bits 0-4 and 11-15, at bits 0-4 and 16-20 */
    267         sub     s, s, rb                    /* both 5-bit differences in one register */
    268         smlabb  misc, misc, alpha, ghalf    /* difference * alpha + rounding constant */
    269         mla     s, s, alpha, rbhalf         /* same for the packed pair */
    270         add     misc, misc, misc, lsl #5    /* multiply by 33 ... */
    271         add     g, g, misc, asr #10         /* ... then shift right by 10 and add to the destination field */
    272         add     s, s, s, lsl #5             /* multiply by 33 ... */
    273         and     g, g, #0x07e0               /* keep only bits 5-10 */
    274         add     rb, rb, s, asr #10          /* ... then shift right by 10 and add to the destination fields */
    275         and     rb, rb, rbmask              /* keep only the two 5-bit fields */
    276         pkhbt   rb, rb, rb, lsl #11         /* high halfword = rb << 11: the bits 16-20 field moves to bits 27-31 */
    277         orr     d, rb, g                    /* bits 0-4 and 5-10 of the result */
    278         orr     d, d, rb, lsr #16           /* bits 27-31 come down to bits 11-15 */
    279 .endm
    280 
    281 /* On entry:
    282  * s holds 1 32bpp source pixel
    283  * d holds 1 16bpp destination pixel
    284  * rbmask holds 0x001f001f
    285  * On exit:
    286  * Constant registers preserved
    287  */
    288 
    289 .macro ARGBto565PixelAlpha_1pixel_opaque  s, d, rbmask   /* convert s to 565 in the low halfword of d; s is overwritten */
    290         and     d, rbmask, s, lsr #3        /* top 5 bits of source bytes 0 and 2, at bits 0-4 and 16-20 */
    291         and     s, s, #0xfc00               /* top 6 bits of source byte 1 (bits 10-15) */
    292         orr     d, d, d, lsr #5             /* copy the bits 16-20 field down to bits 11-15 */
    293         orr     d, d, s, lsr #5             /* insert the 6-bit field at bits 5-10 */
    294 .endm
    295 
    296 /* On entry:
    297  * s1, s2 hold 2 32bpp source pixels
    298  * d holds 2 16bpp destination pixels
    299  * rbmask, rbhalf, ghalf hold 0x001f001f, 0x00100010, 0x00000200 respectively
    300  * other registers are temporaries
    301  * On exit:
    302  * Constant registers preserved
    303  * Blended results have been written through destination pointer
    304  */
    305 
    306 .macro ARGBto565PixelAlpha_2pixels_translucent  s1, s2, d, rbmask, rbhalf, ghalf, alpha, rb, g, misc   /* two interleaved copies of the 1-pixel blend; lines with the extra indent belong to the second pixel */
    307         mov     alpha, s1, lsr #27          /* first pixel: top 5 bits of the source alpha */
    308         and     misc, s1, #0xfc00           /* source bits 10-15 */
    309         and     g, d, #0x07e0               /* destination bits 5-10 of the low halfword */
    310         pkhbt   rb, d, d, lsl #5            /* d bits 11-15 land in bits 16-20 */
    311         rsb     misc, g, misc, lsr #5       /* 6-bit field difference */
    312         and     s1, rbmask, s1, lsr #3      /* source 5-bit fields */
    313         and     rb, rbmask, rb              /* destination 5-bit fields */
    314         sub     s1, s1, rb                  /* packed differences */
    315         smlabb  misc, misc, alpha, ghalf    /* difference * alpha + rounding constant */
    316         mla     s1, s1, alpha, rbhalf       /* last use of the first pixel's alpha */
    317           uxth    d, d, ror #16             /* d becomes the second destination pixel (upper halfword) */
    318         add     misc, misc, misc, lsl #5
    319           mov     alpha, s2, lsr #27        /* alpha is reused for the second pixel */
    320         add     g, g, misc, asr #10
    321         add     s1, s1, s1, lsl #5
    322         and     g, g, #0x07e0
    323         add     rb, rb, s1, asr #10         /* last use of s1 for the first pixel */
    324         and     rb, rb, rbmask
    325           and     misc, s2, #0xfc00
    326         pkhbt   rb, rb, rb, lsl #11
    327           and     s1, d, #0x07e0            /* s1 is reused as the second pixel's 6-bit field */
    328           pkhbt   d, d, d, lsl #5           /* d is reused as the second pixel's 5-bit fields */
    329           rsb     misc, s1, misc, lsr #5
    330           and     s2, rbmask, s2, lsr #3
    331           and     d, rbmask, d
    332           sub     s2, s2, d
    333           smlabb  misc, misc, alpha, ghalf
    334           mla     s2, s2, alpha, rbhalf     /* last use of the second pixel's alpha */
    335         orr     alpha, rb, g                /* alpha is reused to assemble the first result */
    336           add     misc, misc, misc, lsl #5
    337         orr     alpha, alpha, rb, lsr #16
    338           add     s1, s1, misc, asr #10
    339           add     s2, s2, s2, lsl #5
    340           and     s1, s1, #0x07e0
    341           add     d, d, s2, asr #10
    342           and     d, d, rbmask
    343         strh    alpha, [DST, #-4]           /* first result; DST has already been advanced past both pixels */
    344           pkhbt   d, d, d, lsl #11
    345           orr     alpha, d, s1              /* alpha is reused again for the second result */
    346           orr     alpha, alpha, d, lsr #16
    347           strh    alpha, [DST, #-2]         /* second result */
    348 .endm
    349 
    350 /* On entry:
    351  * s1, s2 hold 2 32bpp source pixels
    352  * rbmask holds 0x001f001f
    353  * other registers are temporaries
    354  * On exit:
    355  * Constant registers preserved
    356  * Blended results have been written through destination pointer
    357  */
    358 
    359 .macro ARGBto565PixelAlpha_2pixels_opaque  s1, s2, d, rbmask, g   /* convert two source pixels to 565 and store them; lines with the extra indent belong to the second pixel */
    360         and     g, s1, #0xfc00              /* first pixel: source bits 10-15 */
    361         and     d, rbmask, s1, lsr #3       /* first pixel: 5-bit fields at bits 0-4 and 16-20 */
    362           and     s1, rbmask, s2, lsr #3    /* s1 is reused for the second pixel's 5-bit fields */
    363         orr     d, d, d, lsr #5             /* bits 16-20 field copied down to bits 11-15 */
    364         orr     d, d, g, lsr #5             /* 6-bit field inserted at bits 5-10 */
    365           and     g, s2, #0xfc00            /* g is reused for the second pixel */
    366         strh    d, [DST, #-4]               /* first result; DST has already been advanced past both pixels */
    367           orr     s1, s1, s1, lsr #5
    368           orr     s1, s1, g, lsr #5
    369           strh    s1, [DST, #-2]            /* second result */
    370 .endm
    371 
    372 .macro ARGBto565PixelAlpha_2pixels_head   /* load two source pixels and two destination pixels, set Z if both alphas are zero */
    373         ldrd    WK0, WK1, [SRC], #8       /* two 32bpp source pixels */
    374         ldr     WK2, [DST], #4            /* one word = two 16bpp destination pixels */
    375         orr     SCRATCH, WK0, WK1         /* OR of the sources, for the all-transparent test */
    376         and     ORIG_W, WK0, WK1          /* AND of the sources, for the all-opaque test in the tail */
    377         tst     SCRATCH, #0xff000000      /* Z set when both alpha bytes are zero */
    378 .endm
    379 
    380 .macro ARGBto565PixelAlpha_2pixels_tail   /* consume the flags and registers set by 2pixels_head; both blend macros do their own stores */
    381         beq     20f @ all transparent
    382         cmp     ORIG_W, #0xff000000       /* unsigned >= means both alpha bytes are 0xff */
    383         bhs     10f @ all opaque
    384         ARGBto565PixelAlpha_2pixels_translucent  WK0, WK1, WK2, MASK, STRIDE_M, STRIDE_S, STRIDE_D, WK3, SCRATCH, ORIG_W   /* STRIDE_D, WK3, SCRATCH and ORIG_W serve as temporaries */
    385         b       20f
    386 10:     ARGBto565PixelAlpha_2pixels_opaque  WK0, WK1, WK2, MASK, SCRATCH
    387 20:
    388 .endm
    389 
    390 .macro ARGBto565PixelAlpha_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload   /* numbytes counts destination bytes; all but the last pixel pair are fully processed here */
    391  .if numbytes == 16
    392         ARGBto565PixelAlpha_2pixels_head   /* 16 bytes: first and second of four pairs */
    393         ARGBto565PixelAlpha_2pixels_tail
    394         ARGBto565PixelAlpha_2pixels_head
    395         ARGBto565PixelAlpha_2pixels_tail
    396  .endif
    397  .if numbytes >= 8
    398         ARGBto565PixelAlpha_2pixels_head   /* 8 or 16 bytes: one more complete pair */
    399         ARGBto565PixelAlpha_2pixels_tail
    400  .endif
    401  .if numbytes >= 4
    402         ARGBto565PixelAlpha_2pixels_head   /* final pair: loads only, process_tail finishes it */
    403  .else // numbytes == 2
    404         ldr     WK0, [SRC], #4             /* one 32bpp source pixel */
    405         ldrh    WK2, [DST], #2             /* one 16bpp destination pixel */
    406         tst     WK0, #0xff000000           /* Z set when its alpha byte is zero */
    407  .endif
    408 .endm
    409 
    410 .macro ARGBto565PixelAlpha_process_tail  cond, numbytes, firstreg   /* finish the pixels left pending by process_head */
    411  .if numbytes >= 4
    412         ARGBto565PixelAlpha_2pixels_tail   /* completes the last pair loaded in process_head */
    413  .else // numbytes == 2
    414         beq     20f @ all transparent
    415         cmp     WK0, #0xff000000           /* unsigned >= means the alpha byte is 0xff */
    416         bhs     10f @ opaque
    417         ARGBto565PixelAlpha_1pixel_translucent  WK0, WK2, MASK, STRIDE_M, STRIDE_S, STRIDE_D, WK3, SCRATCH, ORIG_W   /* STRIDE_D, WK3, SCRATCH and ORIG_W serve as temporaries */
    418         b       19f
    419 10:     ARGBto565PixelAlpha_1pixel_opaque WK0, WK2, MASK
    420 19:     strh    WK2, [DST, #-2]            /* DST was already advanced by 2 in process_head */
    421 20:
    422  .endif
    423 .endm
    424 
    425 generate_composite_function /* instantiate the 32bpp to 16bpp per-pixel-alpha blit */ \
    426     BlitARGBto565PixelAlphaARMSIMDAsm, 32, 0, 16, /* presumably source, mask and destination bits per pixel */ \
    427     FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
    428     2, /* prefetch distance */ \
    429     ARGBto565PixelAlpha_init, /* init: sets up rbmask and rbhalf */ \
    430     ARGBto565PixelAlpha_newline, /* newline: sets up ghalf */ \
    431     nop_macro, /* cleanup */ \
    432     ARGBto565PixelAlpha_process_head, /* process head */ \
    433     ARGBto565PixelAlpha_process_tail /* process tail */
    434 
    435  /******************************************************************************/
    436 
    437 .macro BGR888toRGB888_1pixel cond, reg, tmp            /* swap bytes 0 and 2 of WK<reg>, keeping bytes 1 and 3; every instruction carries the cond suffix */
    438         uxtb16&cond  tmp, WK&reg, ror #8               /* tmp = bytes 1 and 3, moved down to byte positions 0 and 2 */
    439         uxtb16&cond  WK&reg, WK&reg, ror #16           /* byte 2 moves to position 0 and byte 0 to position 2 */
    440         orr&cond     WK&reg, WK&reg, tmp, lsl #8       /* bytes 1 and 3 go back to their original positions */
    441 .endm
    442 
    443 .macro BGR888toRGB888_2pixels cond, reg1, reg2, tmp1, tmp2   /* the 1-pixel byte swap applied to WK<reg1> and WK<reg2>, with the two sequences interleaved */
    444         uxtb16&cond  tmp1, WK&reg1, ror #8             /* first pixel: bytes 1 and 3 */
    445         uxtb16&cond  WK&reg1, WK&reg1, ror #16         /* first pixel: bytes 0 and 2 exchanged */
    446         uxtb16&cond  tmp2, WK&reg2, ror #8             /* second pixel: bytes 1 and 3 */
    447         uxtb16&cond  WK&reg2, WK&reg2, ror #16         /* second pixel: bytes 0 and 2 exchanged */
    448         orr&cond     WK&reg1, WK&reg1, tmp1, lsl #8    /* recombine the first pixel */
    449         orr&cond     WK&reg2, WK&reg2, tmp2, lsl #8    /* recombine the second pixel */
    450 .endm
    451 
    452 .macro BGR888toRGB888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload   /* only loads; the conversion is done in process_tail */
    453         pixld   cond, numbytes, firstreg, SRC, unaligned_src   /* pixld is defined outside this file; presumably loads numbytes from SRC into WK<firstreg> onwards */
    454 .endm
    455 
    456 .macro BGR888toRGB888_process_tail  cond, numbytes, firstreg   /* byte-swap 1, 2 or 4 pixels in place; MASK and STRIDE_M serve as temporaries */
    457  .if numbytes >= 8
    458         BGR888toRGB888_2pixels cond, %(firstreg+0), %(firstreg+1), MASK, STRIDE_M   /* %(...) is evaluated to a number, giving WK<n> register names */
    459   .if numbytes == 16
    460         BGR888toRGB888_2pixels cond, %(firstreg+2), %(firstreg+3), MASK, STRIDE_M   /* second pair, only for 16 bytes */
    461   .endif
    462  .else @ numbytes == 4
    463         BGR888toRGB888_1pixel cond, %(firstreg+0), MASK
    464  .endif
    465 .endm
    466 
    467 generate_composite_function /* instantiate the 32bpp byte-swapping blit */ \
    468     Blit_BGR888_RGB888ARMSIMDAsm, 32, 0, 32, /* presumably source, mask and destination bits per pixel */ \
    469     FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
    470     2, /* prefetch distance */ \
    471     nop_macro, /* init */ \
    472     nop_macro, /* newline */ \
    473     nop_macro, /* cleanup */ \
    474     BGR888toRGB888_process_head, /* process head: loads */ \
    475     BGR888toRGB888_process_tail /* process tail: swaps bytes 0 and 2 of each pixel */
    476 
    477 /******************************************************************************/
    478 
    479 .macro RGB444toRGB888_init                /* one-time setup: nibble mask and GE flags for the conversion macros */
    480         ldr     MASK, =0x0f0f0f0f         /* keeps the low nibble of every byte; passed as "mask" to the pixel macros */
    481         /* Set GE[3:0] to 0101 so SEL instructions do what we want */
    482         msr     CPSR_s, #0x50000          /* with GE = 0101, sel takes bytes 0 and 2 from its first source and bytes 1 and 3 from its second */
    483 .endm
    484 
    485 .macro RGB444toRGB888_1pixel reg, mask, tmp   /* expand the 16bpp pixel in WK<reg> to 32bpp in place; per the first bit diagram the upper halfword is not taken from the input; relies on GE = 0101 */
    486         pkhbt   WK&reg, WK&reg, WK&reg, lsl #12      @ 0000aaaarrrrggggaaaarrrrggggbbbb
    487         and     WK&reg, mask, WK&reg                 @ 0000aaaa0000gggg0000rrrr0000bbbb
    488         orr     WK&reg, WK&reg, WK&reg, lsl #4       @ aaaaaaaaggggggggrrrrrrrrbbbbbbbb
    489         pkhtb   tmp, WK&reg, WK&reg, asr #8          @ aaaaaaaaggggggggggggggggrrrrrrrr
    490         pkhbt   WK&reg, WK&reg, WK&reg, lsl #8       @ ggggggggrrrrrrrrrrrrrrrrbbbbbbbb
    491         sel     WK&reg, WK&reg, tmp                  @ aaaaaaaarrrrrrrrggggggggbbbbbbbb
    492 .endm
    493 
    494 .macro RGB444toRGB888_2pixels in, out1, out2, mask, tmp1, tmp2   /* expand the two 16bpp pixels in WK<in> to 32bpp in WK<out1> (low halfword) and WK<out2> (high halfword); relies on GE = 0101 */
    495         and     tmp1, mask, WK&in                    @ 0000RRRR0000BBBB0000rrrr0000bbbb
    496         and     tmp2, mask, WK&in, lsr #4            @ 0000AAAA0000GGGG0000aaaa0000gggg
    497         orr     tmp1, tmp1, tmp1, lsl #4             @ RRRRRRRRBBBBBBBBrrrrrrrrbbbbbbbb
    498         orr     tmp2, tmp2, tmp2, lsl #4             @ AAAAAAAAGGGGGGGGaaaaaaaagggggggg
    499         pkhtb   WK&out2, tmp2, tmp1, asr #16         @ AAAAAAAAGGGGGGGGRRRRRRRRBBBBBBBB
    500         pkhbt   WK&out1, tmp1, tmp2, lsl #16         @ aaaaaaaaggggggggrrrrrrrrbbbbbbbb
    501         pkhtb   tmp2, WK&out2, WK&out2, asr #8       @ AAAAAAAAGGGGGGGGGGGGGGGGRRRRRRRR
    502         pkhtb   tmp1, WK&out1, WK&out1, asr #8       @ aaaaaaaaggggggggggggggggrrrrrrrr
    503         pkhbt   WK&out1, WK&out1, WK&out1, lsl #8    @ ggggggggrrrrrrrrrrrrrrrrbbbbbbbb
    504         pkhbt   WK&out2, WK&out2, WK&out2, lsl #8    @ GGGGGGGGRRRRRRRRRRRRRRRRBBBBBBBB
    505         sel     WK&out1, WK&out1, tmp1               @ aaaaaaaarrrrrrrrggggggggbbbbbbbb
    506         sel     WK&out2, WK&out2, tmp2               @ AAAAAAAARRRRRRRRGGGGGGGGBBBBBBBB
    507 .endm
    508 
    509 .macro RGB444toRGB888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload   /* only loads; the expansion is done in process_tail */
    510         pixld   cond, numbytes/2, firstreg, SRC, unaligned_src   /* half of numbytes is requested: the source is 16bpp, the destination 32bpp */
    511 .endm
    512 
    513 .macro RGB444toRGB888_process_tail  cond, numbytes, firstreg   /* expand the loaded 16bpp pixels to 32bpp; STRIDE_M and SCRATCH serve as temporaries */
    514  .if numbytes >= 8
    515   .if numbytes == 16
    516         RGB444toRGB888_2pixels %(firstreg+1), %(firstreg+2), %(firstreg+3), MASK, STRIDE_M, SCRATCH   /* second input word is expanded first, into the two upper registers */
    517   .endif
    518         RGB444toRGB888_2pixels %(firstreg+0), %(firstreg+0), %(firstreg+1), MASK, STRIDE_M, SCRATCH   /* this overwrites WK<firstreg+1>, so it must come after the expansion above */
    519  .else @ numbytes == 4
    520         RGB444toRGB888_1pixel %(firstreg+0), MASK, SCRATCH
    521  .endif
    522 .endm
    523 
    524 generate_composite_function /* instantiate the 16bpp to 32bpp expanding blit */ \
    525     Blit_RGB444_RGB888ARMSIMDAsm, 16, 0, 32, /* presumably source, mask and destination bits per pixel */ \
    526     FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
    527     2, /* prefetch distance */ \
    528     RGB444toRGB888_init, /* init: nibble mask and GE flags */ \
    529     nop_macro, /* newline */ \
    530     nop_macro, /* cleanup */ \
    531     RGB444toRGB888_process_head, /* process head: loads */ \
    532     RGB444toRGB888_process_tail /* process tail: nibble expansion */