sdl

FORK: Simple DirectMedia Layer
git clone https://git.neptards.moe/neptards/sdl.git

pixman-arm-neon-asm.S (12847B)


/*
 * Copyright © 2009 Nokia Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
 */

/*
 * Copyright (c) 2018 RISC OS Open Ltd
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* Prevent the stack from becoming executable for no reason... */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

    .text
    .fpu neon
    .arch armv7a
    .object_arch armv4
    .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
    .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
    .arm
    .altmacro
    .p2align 2

#include "pixman-arm-asm.h"
#include "pixman-arm-neon-asm.h"

/* Global configuration options and preferences */

/*
 * The code can optionally make use of unaligned memory accesses to improve
 * performance of handling leading/trailing pixels for each scanline.
 * The configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0,
 * for example on Linux, if unaligned memory accesses are not configured
 * to generate exceptions.
 */
.set RESPECT_STRICT_ALIGNMENT, 1
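
/*
 * E.g. (illustrative note, not in the original source): a build targeting a
 * system where unaligned accesses are known to be handled in hardware could
 * instead use
 *
 *   .set RESPECT_STRICT_ALIGNMENT, 0
 */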

/*
 * Set default prefetch type. There is a choice between the following options:
 *
 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
 * as NOP to work around some HW bugs or for whatever other reason)
 *
 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
 * advanced prefetch introduces heavy overhead)
 *
 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
 * which can run ARM and NEON instructions simultaneously so that extra ARM
 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
 *
 * Note: some types of function can't support advanced prefetch and fall back
 *       to the simple one (those which handle 24bpp pixels)
 */
.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
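
/*
 * E.g. (illustrative, not in the original source): a simple in-order core
 * might prefer
 *
 *   .set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_SIMPLE
 */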

/* Prefetch distance in pixels for simple prefetch */
.set PREFETCH_DISTANCE_SIMPLE, 64

/******************************************************************************/

/* We can actually do significantly better than the Pixman macros, at least for
 * the case of fills, by using a carefully scheduled inner loop. Cortex-A53
 * shows an improvement of up to 78% in ideal cases (large fills to L1 cache).
 */

.macro generate_fillrect_function name, bpp, log2Bpp
/*
 * void name(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
 * On entry:
 * a1 = width, pixels
 * a2 = height, rows
 * a3 = pointer to top-left destination pixel
 * a4 = stride, pixels
 * [sp] = pixel value to fill with
 * Within the function:
 * v1 = width remaining
 * v2 = vst offset
 * v3 = alternate pointer
 * ip = data ARM register
 */
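/*
 * Rough C equivalent of the generated function (an illustrative sketch added
 * for reference, not part of the original source; shown for the 32bpp case,
 * with the stride given in pixels as above):
 *
 *   void fillrect_ref(int32_t w, int32_t h, uint8_t *dst,
 *                     int32_t dst_stride, uint32_t src)
 *   {
 *       while (h--) {
 *           uint32_t *p = (uint32_t *) dst;
 *           for (int32_t x = 0; x < w; x++)
 *               p[x] = src;            // fill one row
 *           dst += dst_stride * 4;     // advance one row (4 bytes per pixel)
 *       }
 *   }
 */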
pixman_asm_function name
    vld1.\bpp   {d0[],d1[]}, [sp]
    sub         a4, a1
    vld1.\bpp   {d2[],d3[]}, [sp]
    cmp         a1, #(15+64) >> \log2Bpp
    push        {v1-v3,lr}
    vmov        ip, s0
    blo         51f

    /* Long-row case */
    mov         v2, #64
1:  mov         v1, a1
    ands        v3, a3, #15
    beq         2f
    /* Leading pixels */
    rsb         v3, v3, #16  /* number of leading bytes until 16-byte aligned */
    sub         v1, v1, v3, lsr #\log2Bpp
    rbit        v3, v3
.if bpp <= 16
.if bpp == 8
    tst         a3, #1       /* bit 0 unaffected by rsb so can avoid register interlock */
    strneb      ip, [a3], #1
    tst         v3, #1<<30
.else
    tst         a3, #2       /* bit 1 unaffected by rsb (assuming halfword alignment) so can avoid register interlock */
.endif
    strneh      ip, [a3], #2
.endif
    movs        v3, v3, lsl #3
    vstmcs      a3!, {s0}
    vstmmi      a3!, {d0}
2:  sub         v1, v1, #64 >> \log2Bpp /* simplifies inner loop termination */
    add         v3, a3, #32
    /* Inner loop */
3:  vst1.\bpp   {q0-q1}, [a3 :128], v2
    subs        v1, v1, #64 >> \log2Bpp
    vst1.\bpp   {q0-q1}, [v3 :128], v2
    bhs         3b
    /* Trailing pixels */
4:  movs        v1, v1, lsl #27 + \log2Bpp
    bcc         5f
    vst1.\bpp   {q0-q1}, [a3 :128]!
5:  bpl         6f
    vst1.\bpp   {q0}, [a3 :128]!
6:  movs        v1, v1, lsl #2
    vstmcs      a3!, {d0}
    vstmmi      a3!, {s0}
.if bpp <= 16
    movs        v1, v1, lsl #2
    strcsh      ip, [a3], #2
.if bpp == 8
    strmib      ip, [a3], #1
.endif
.endif
    subs        a2, a2, #1
    add         a3, a3, a4, lsl #\log2Bpp
    bhi         1b
    pop         {v1-v3,pc}

    /* Short-row case */
51: movs        v1, a1
.if bpp == 8
    tst         a3, #3
    beq         53f
52: subs        v1, v1, #1
    blo         57f
    strb        ip, [a3], #1
    tst         a3, #3
    bne         52b
.elseif bpp == 16
    tstne       a3, #2
    subne       v1, v1, #1
    strneh      ip, [a3], #2
.endif
53: cmp         v1, #32 >> \log2Bpp
    bcc         54f
    vst1.\bpp   {q0-q1}, [a3]!
    sub         v1, v1, #32 >> \log2Bpp
    /* Trailing pixels */
54: movs        v1, v1, lsl #27 + \log2Bpp
    bcc         55f
    vst1.\bpp   {q0-q1}, [a3]!
55: bpl         56f
    vst1.\bpp   {q0}, [a3]!
56: movs        v1, v1, lsl #2
    vstmcs      a3!, {d0}
    vstmmi      a3!, {s0}
.if bpp <= 16
    movs        v1, v1, lsl #2
    strcsh      ip, [a3], #2
.if bpp == 8
    strmib      ip, [a3], #1
.endif
.endif
    subs        a2, a2, #1
    add         a3, a3, a4, lsl #\log2Bpp
    bhi         51b
57: pop         {v1-v3,pc}

.endfunc
.endm

generate_fillrect_function FillRect32ARMNEONAsm, 32, 2
generate_fillrect_function FillRect16ARMNEONAsm, 16, 1
generate_fillrect_function FillRect8ARMNEONAsm,  8,  0
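
/*
 * Hypothetical C-side declaration and call (illustrative, not taken from
 * this repository; assumes a 32bpp surface whose pitch is given in bytes):
 *
 *   extern void FillRect32ARMNEONAsm(int32_t w, int32_t h, uint8_t *dst,
 *                                    int32_t dst_stride, uint32_t color);
 *
 *   FillRect32ARMNEONAsm(w, h, pixels, pitch >> 2, color);
 */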

/******************************************************************************/

.macro RGBtoRGBPixelAlpha_process_pixblock_head
    vmvn        d30, d3  /* get inverted source alpha */
    vmov        d31, d7  /* dest alpha is always unchanged */
    vmull.u8    q14, d0, d3
    vmlal.u8    q14, d4, d30
    vmull.u8    q0, d1, d3
    vmlal.u8    q0, d5, d30
    vmull.u8    q1, d2, d3
    vmlal.u8    q1, d6, d30
    vrshr.u16   q2, q14, #8
    vrshr.u16   q3, q0, #8
    vraddhn.u16 d28, q14, q2
    vrshr.u16   q2, q1, #8
    vraddhn.u16 d29, q0, q3
    vraddhn.u16 d30, q1, q2
.endm
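
/*
 * Explanatory note (added for reference, not in the original source): per
 * colour channel the head computes the standard alpha blend,
 *
 *   x     = src_c * src_a + dst_c * (255 - src_a)   (vmull.u8 + vmlal.u8)
 *   dst_c = (x + ((x + 128) >> 8) + 128) >> 8       (vrshr.u16 + vraddhn.u16)
 *
 * where the second line is the usual exact rounding division by 255.
 */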

.macro RGBtoRGBPixelAlpha_process_pixblock_tail
    /* nothing */
.endm

.macro RGBtoRGBPixelAlpha_process_pixblock_tail_head
    vld4.8      {d0-d3}, [SRC]!
                                    PF add PF_X, PF_X, #8
        vst4.8      {d28-d31}, [DST_W :128]!
                                    PF tst PF_CTL, #0xF
    vld4.8      {d4-d7}, [DST_R :128]!
                                    PF addne PF_X, PF_X, #8
    vmvn        d30, d3  /* get inverted source alpha */
    vmov        d31, d7  /* dest alpha is always unchanged */
    vmull.u8    q14, d0, d3
                                    PF subne PF_CTL, PF_CTL, #1
    vmlal.u8    q14, d4, d30
                                    PF cmp PF_X, ORIG_W
    vmull.u8    q0, d1, d3
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmlal.u8    q0, d5, d30
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vmull.u8    q1, d2, d3
                                    PF subge PF_X, PF_X, ORIG_W
    vmlal.u8    q1, d6, d30
                                    PF subges PF_CTL, PF_CTL, #0x10
    vrshr.u16   q2, q14, #8
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vrshr.u16   q3, q0, #8
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vraddhn.u16 d28, q14, q2
    vrshr.u16   q2, q1, #8
    vraddhn.u16 d29, q0, q3
    vraddhn.u16 d30, q1, q2
.endm
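
/*
 * Explanatory note (added for reference, not in the original source): the
 * tail_head variant software-pipelines the loop, interleaving the store of
 * the previous pixel block with the loads and arithmetic for the next one,
 * and slotting the PF-prefixed advanced-prefetch ARM instructions between
 * NEON instructions so they cost few or no extra cycles on dual-issue cores.
 */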

generate_composite_function \
    BlitRGBtoRGBPixelAlphaARMNEONAsm, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    RGBtoRGBPixelAlpha_process_pixblock_head, \
    RGBtoRGBPixelAlpha_process_pixblock_tail, \
    RGBtoRGBPixelAlpha_process_pixblock_tail_head
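
/*
 * Hypothetical C-side declaration (illustrative, not taken from this
 * repository; assumes the usual pixman-style composite-function convention
 * of width, height, then destination and source pointer/stride pairs, with
 * strides in pixels):
 *
 *   extern void BlitRGBtoRGBPixelAlphaARMNEONAsm(
 *       int32_t w, int32_t h,
 *       uint32_t *dst, int32_t dst_stride,
 *       uint32_t *src, int32_t src_stride);
 */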

/******************************************************************************/

.macro ARGBto565PixelAlpha_process_pixblock_head
    vmvn        d6, d3
    vshr.u8     d1, #2
    vshr.u8     d3, #3
    vshr.u8     d0, #3
    vshrn.u16   d7, q2, #3
    vshrn.u16   d25, q2, #8
    vbic.i16    q2, #0xe0
    vshr.u8     d6, #3
    vshr.u8     d7, #2
    vshr.u8     d2, #3
    vmovn.u16   d24, q2
    vshr.u8     d25, #3
    vmull.u8    q13, d1, d3
    vmlal.u8    q13, d7, d6
    vmull.u8    q14, d0, d3
    vmlal.u8    q14, d24, d6
    vmull.u8    q15, d2, d3
    vmlal.u8    q15, d25, d6
.endm

.macro ARGBto565PixelAlpha_process_pixblock_tail
    vsra.u16    q13, #5
    vsra.u16    q14, #5
    vsra.u16    q15, #5
    vrshr.u16   q13, #5
    vrshr.u16   q14, #5
    vrshr.u16   q15, #5
    vsli.u16    q14, q13, #5
    vsli.u16    q14, q15, #11
.endm
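
/*
 * Explanatory note (added for reference, not in the original source): the
 * head reduces the ARGB8888 source channels and alpha to 565 precision and
 * unpacks the RGB565 destination, then blends each channel at that reduced
 * precision with a 5-bit alpha a5 = src_a >> 3:
 *
 *   x     = src_c * a5 + dst_c * (31 - a5)   (vmull.u8 + vmlal.u8)
 *   dst_c = (x + (x >> 5) + 16) >> 5         (vsra.u16 + vrshr.u16)
 *
 * i.e. a rounding division by 31; the two vsli.u16 instructions in the tail
 * then repack the blended channels into RGB565.
 */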

.macro ARGBto565PixelAlpha_process_pixblock_tail_head
    vld4.8      {d0-d3}, [SRC]!
                                    PF add PF_X, PF_X, #8
        vsra.u16    q13, #5
                                    PF tst PF_CTL, #0xF
        vsra.u16    q14, #5
                                    PF addne PF_X, PF_X, #8
        vsra.u16    q15, #5
                                    PF subne PF_CTL, PF_CTL, #1
        vrshr.u16   q13, #5
                                    PF cmp PF_X, ORIG_W
        vrshr.u16   q14, #5
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
        vrshr.u16   q15, #5
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vld1.8      {d4-d5}, [DST_R]!
                                    PF subge PF_X, PF_X, ORIG_W
        vsli.u16    q14, q13, #5
                                    PF subges PF_CTL, PF_CTL, #0x10
        vsli.u16    q14, q15, #11
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
        vst1.8      {q14}, [DST_W :128]!
    vmvn        d6, d3
    vshr.u8     d1, #2
    vshr.u8     d3, #3
    vshr.u8     d0, #3
    vshrn.u16   d7, q2, #3
    vshrn.u16   d25, q2, #8
    vbic.i16    q2, #0xe0
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vshr.u8     d6, #3
    vshr.u8     d7, #2
    vshr.u8     d2, #3
    vmovn.u16   d24, q2
    vshr.u8     d25, #3
    vmull.u8    q13, d1, d3
    vmlal.u8    q13, d7, d6
    vmull.u8    q14, d0, d3
    vmlal.u8    q14, d24, d6
    vmull.u8    q15, d2, d3
    vmlal.u8    q15, d25, d6
.endm

generate_composite_function \
    BlitARGBto565PixelAlphaARMNEONAsm, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels processed in a single block */ \
    6, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    ARGBto565PixelAlpha_process_pixblock_head, \
    ARGBto565PixelAlpha_process_pixblock_tail, \
    ARGBto565PixelAlpha_process_pixblock_tail_head
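
/*
 * Hypothetical C-side declaration (illustrative, not taken from this
 * repository; same assumed convention as above, with a 16bpp destination):
 *
 *   extern void BlitARGBto565PixelAlphaARMNEONAsm(
 *       int32_t w, int32_t h,
 *       uint16_t *dst, int32_t dst_stride,
 *       uint32_t *src, int32_t src_stride);
 */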