sdl

FORK: Simple DirectMedia Layer
git clone https://git.neptards.moe/neptards/sdl.git

pixman-arm-simd-asm.h (36837B)


/*
 * Copyright (c) 2012 Raspberry Pi Foundation
 * Copyright (c) 2012 RISC OS Open Ltd
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/*
 * Because the alignment of pixel data to cachelines, and even the number of
 * cachelines per row can vary from row to row, and because of the need to
 * preload each scanline once and only once, this prefetch strategy treats
 * each row of pixels independently. When a pixel row is long enough, there
 * are three distinct phases of prefetch:
 * * an inner loop section, where each time a cacheline of data is
 *    processed, another cacheline is preloaded (the exact distance ahead is
 *    determined empirically using profiling results from lowlevel-blt-bench)
 * * a leading section, where enough cachelines are preloaded to ensure no
 *    cachelines escape being preloaded when the inner loop starts
 * * a trailing section, where a limited number (0 or more) of cachelines
 *    are preloaded to deal with data (if any) that hangs off the end of the
 *    last iteration of the inner loop, plus any trailing bytes that were not
 *    enough to make up one whole iteration of the inner loop
 *
 * There are (in general) three distinct code paths, selected between
 * depending upon how long the pixel row is. If it is long enough that there
 * is at least one iteration of the inner loop (as described above) then
 * this is described as the "wide" case. If it is shorter than that, but
 * there are still enough bytes output that there is at least one 16-byte-
 * long, 16-byte-aligned write to the destination (the optimum type of
 * write), then this is the "medium" case. If it is not even this long, then
 * this is the "narrow" case, and there is no attempt to align writes to
 * 16-byte boundaries. In the "medium" and "narrow" cases, all the
 * cachelines containing data from the pixel row are prefetched up-front.
 */
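
/* Worked example (illustrative, not from the original source): for a 32bpp
 * source and a 32bpp write-only destination with prefetch_distance = 4,
 * pix_per_block works out to 8 (see generate_composite_function_common
 * below), so the three-way dispatch in the generated function body becomes:
 *
 *     cmp     X, #2*16*8/dst_w_bpp - 1                      @ i.e. #7
 *     blo     170f                                          @ narrow: <= 6 pixels
 *     cmp     X, #(prefetch_distance+3)*pix_per_block - 1   @ i.e. #55
 *     blo     160f                                          @ medium: 7..54 pixels
 *                                                           @ wide:   >= 55 pixels
 */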

/*
 * Determine whether we put the arguments on the stack for debugging.
 */
#undef DEBUG_PARAMS

/*
 * Bit flags for the 'generate_composite_function' macro which are used
 * to tune the generated functions' behavior.
 */
.set FLAG_DST_WRITEONLY,         0
.set FLAG_DST_READWRITE,         1
.set FLAG_COND_EXEC,             0
.set FLAG_BRANCH_OVER,           2
.set FLAG_PROCESS_PRESERVES_PSR, 0
.set FLAG_PROCESS_CORRUPTS_PSR,  4
.set FLAG_PROCESS_DOESNT_STORE,  0
.set FLAG_PROCESS_DOES_STORE,    8 /* usually because it needs to conditionally skip it */
.set FLAG_NO_SPILL_LINE_VARS,        0
.set FLAG_SPILL_LINE_VARS_WIDE,      16
.set FLAG_SPILL_LINE_VARS_NON_WIDE,  32
.set FLAG_SPILL_LINE_VARS,           48
.set FLAG_PROCESS_CORRUPTS_SCRATCH,  0
.set FLAG_PROCESS_PRESERVES_SCRATCH, 64
.set FLAG_PROCESS_PRESERVES_WK0,     0
.set FLAG_PROCESS_CORRUPTS_WK0,      128 /* if possible, use the specified register(s) instead so WK0 can hold number of leading pixels */
.set FLAG_PRELOAD_DST,               0
.set FLAG_NO_PRELOAD_DST,            256

/*
 * Number of bytes by which to adjust preload offset of destination
 * buffer (allows preload instruction to be moved before the load(s))
 */
.set DST_PRELOAD_BIAS, 0

/*
 * Offset into stack where mask and source pointer/stride can be accessed.
 */
#ifdef DEBUG_PARAMS
.set ARGS_STACK_OFFSET,        (9*4+9*4)
#else
.set ARGS_STACK_OFFSET,        (9*4)
#endif

/*
 * Offset into stack where space allocated during init macro can be accessed.
 */
.set LOCALS_STACK_OFFSET,     0

/*
 * Constants for selecting preferable prefetch type.
 */
.set PREFETCH_TYPE_NONE,       0
.set PREFETCH_TYPE_STANDARD,   1

/*
 * Definitions of macros for load/store of pixel data.
 */

.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
 .if numbytes == 16
  .if unaligned == 1
        op&r&cond    WK&reg0, [base], #4
        op&r&cond    WK&reg1, [base], #4
        op&r&cond    WK&reg2, [base], #4
        op&r&cond    WK&reg3, [base], #4
  .else
        op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
  .endif
 .elseif numbytes == 8
  .if unaligned == 1
        op&r&cond    WK&reg0, [base], #4
        op&r&cond    WK&reg1, [base], #4
  .else
        op&m&cond&ia base!, {WK&reg0,WK&reg1}
  .endif
 .elseif numbytes == 4
        op&r&cond    WK&reg0, [base], #4
 .elseif numbytes == 2
        op&r&cond&h  WK&reg0, [base], #2
 .elseif numbytes == 1
        op&r&cond&b  WK&reg0, [base], #1
 .else
  .error "unsupported size: numbytes"
 .endif
.endm

.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
 .if numbytes == 16
        stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
 .elseif numbytes == 8
        stm&cond&db base, {WK&reg0,WK&reg1}
 .elseif numbytes == 4
        str&cond    WK&reg0, [base, #-4]
 .elseif numbytes == 2
        str&cond&h  WK&reg0, [base, #-2]
 .elseif numbytes == 1
        str&cond&b  WK&reg0, [base, #-1]
 .else
  .error "unsupported size: numbytes"
 .endif
.endm

.macro pixld cond, numbytes, firstreg, base, unaligned
        pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
.endm

.macro pixst cond, numbytes, firstreg, base
 .if (flags) & FLAG_DST_READWRITE
        pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
 .else
        pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
 .endif
.endm
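
/* Illustrative expansions (not from the original source): with the register
 * aliases WK0-WK3 = r8-r11 assigned below, the aligned 16-byte load
 * "pixld , 16, 0, SRC, 0" becomes a single load-multiple,
 *
 *     ldmia   SRC!, {r8, r9, r10, r11}
 *
 * while the unaligned form "pixld , 16, 0, SRC, 1" falls back to four
 * post-indexed word loads:
 *
 *     ldr     r8, [SRC], #4
 *     ldr     r9, [SRC], #4
 *     ldr     r10, [SRC], #4
 *     ldr     r11, [SRC], #4
 *
 * For stores, "pixst , 16, 0, DST" emits "stmia DST!, {r8-r11}" in the
 * write-only case, but "stmdb DST, {r8-r11}" with no writeback when
 * FLAG_DST_READWRITE is set, because the earlier destination read has
 * already advanced DST past the block (hence "pixst_baseupdated"). */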

.macro PF a, x:vararg
 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
        a x
 .endif
.endm
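
/* Illustrative expansion (not from the original source): PF swallows its
 * whole argument list when prefetch is disabled (prefetch_distance == 0), so
 * "PF pld, [ptr, #OFFSET]" becomes "pld [ptr, #OFFSET]" under
 * PREFETCH_TYPE_STANDARD and assembles to nothing at all under
 * PREFETCH_TYPE_NONE. */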


.macro preload_leading_step1  bpp, ptr, base
/* If the destination is already 16-byte aligned, then we need to preload
 * between 0 and prefetch_distance (inclusive) cache lines ahead so there
 * are no gaps when the inner loop starts.
 */
 .if bpp > 0
        PF  bic,    ptr, base, #31
  .set OFFSET, 0
  .rept prefetch_distance+1
        PF  pld,    [ptr, #OFFSET]
   .set OFFSET, OFFSET+32
  .endr
 .endif
.endm

.macro preload_leading_step2  bpp, bpp_shift, ptr, base
/* However, if the destination is not 16-byte aligned, we may need to
 * preload more cache lines than that. The question we need to ask is:
 * are the bytes corresponding to the leading pixels more than the amount
 * by which the source pointer will be rounded down for preloading, and if
 * so, by how many cache lines? Effectively, we want to calculate
 *     leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
 *     inner_loop_offset = (src+leading_bytes)&31
 *     extra_needed = leading_bytes - inner_loop_offset
 * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
 * possible when there are 4 src bytes for every 1 dst byte).
 */
 .if bpp > 0
  .ifc base,DST
        /* The test can be simplified further when preloading the destination */
        PF  tst,    base, #16
        PF  beq,    61f
  .else
   .if bpp/dst_w_bpp == 4
        PF  add,    SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
        PF  and,    SCRATCH, SCRATCH, #31
        PF  rsb,    SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
        PF  sub,    SCRATCH, SCRATCH, #1        /* so now ranges are -16..-1 / 0..31 / 32..63 */
        PF  movs,   SCRATCH, SCRATCH, lsl #32-6 /* so this sets         NC   /  nc   /   Nc   */
        PF  bcs,    61f
        PF  bpl,    60f
        PF  pld,    [ptr, #32*(prefetch_distance+2)]
   .else
        PF  mov,    SCRATCH, base, lsl #32-5
        PF  add,    SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
        PF  rsbs,   SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
        PF  bls,    61f
   .endif
  .endif
60:     PF  pld,    [ptr, #32*(prefetch_distance+1)]
61:
 .endif
.endm
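
/* Worked example (illustrative, not from the original source): take a 32bpp
 * source with an 8bpp destination (so bpp/dst_w_bpp == 4), and suppose WK0
 * holds 12 leading destination bytes while SRC is 24 bytes into its cache
 * line. Then leading_bytes = 12 << 2 = 48, inner_loop_offset =
 * (24 + 48) & 31 = 8, and extra_needed = 48 - 8 = 40. Since 40 - 1 = 39
 * falls in the 32..63 band, the "NC / nc / Nc" flag trick above leaves N set
 * and C clear, both conditional branches fall through, and two extra cache
 * lines are preloaded, at offsets 32*(prefetch_distance+2) and
 * 32*(prefetch_distance+1). */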

#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
.macro preload_middle   bpp, base, scratch_holds_offset
 .if bpp > 0
        /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
  .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
   .if scratch_holds_offset
        PF  pld,    [base, SCRATCH]
   .else
        PF  bic,    SCRATCH, base, #31
        PF  pld,    [SCRATCH, #32*prefetch_distance]
   .endif
  .endif
 .endif
.endm
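
/* Illustrative note (not from the original source): IS_END_OF_GROUP selects
 * the last STM in each group of SIZE STMs. For a straight 32bpp-to-32bpp
 * blit, SIZE = 256/128*32/32 = 2, and the expression is true exactly for odd
 * SUBBLOCK values, so one 32-byte preload is issued for every two 16-byte
 * stores - one preload per cache line consumed. For a 16bpp source with a
 * 32bpp destination, SIZE = 4 and only the last STM of each group of four
 * (SUBBLOCK = 3) triggers a preload, again matching one preload per 32
 * source bytes. */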

.macro preload_trailing  bpp, bpp_shift, base
 .if bpp > 0
  .if bpp*pix_per_block > 256
        /* Calculations are more complex if more than one fetch per block */
        PF  and,    WK1, base, #31
        PF  add,    WK1, WK1, WK0, lsl #bpp_shift
        PF  add,    WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
        PF  bic,    SCRATCH, base, #31
80:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
        PF  add,    SCRATCH, SCRATCH, #32
        PF  subs,   WK1, WK1, #32
        PF  bhi,    80b
  .else
        /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
        PF  mov,    SCRATCH, base, lsl #32-5
        PF  adds,   SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
        PF  adceqs, SCRATCH, SCRATCH, #0
        /* The instruction above has two effects: it ensures Z is only
         * set if C was clear (so Z indicates that both shifted quantities
         * were 0), and clears C if Z was set (so C indicates that the sum
         * of the shifted quantities was strictly greater than 32) */
        PF  beq,    82f
        PF  bic,    SCRATCH, base, #31
        PF  bcc,    81f
        PF  pld,    [SCRATCH, #32*(prefetch_distance+2)]
81:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
82:
  .endif
 .endif
.endm
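
/* Worked example (illustrative, not from the original source) of the flag
 * trick above: both shifted operands reduce to their low 5 bits, so the adds
 * computes ((base & 31) + ((X << bpp_shift) & 31)) << 27. If that sum is 40,
 * C is set and Z clear, the adceqs does not execute, and both trailing cache
 * lines are preloaded. If the sum is exactly 32, C and Z are both set, so
 * the adceqs adds the carry, clearing both flags: the beq falls through and
 * the bcc skips to 81f, giving one preload. If the sum is 0, Z alone is set
 * and survives the adceqs, so the beq skips every preload. */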


.macro preload_line    narrow_case, bpp, bpp_shift, base
/* "narrow_case" - just means that the macro was invoked from the "narrow"
 *    code path rather than the "medium" one - because in the narrow case,
 *    the row of pixels is known to output no more than 30 bytes, so
 *    (assuming the source pixels are no wider than the destination
 *    pixels) they cannot possibly straddle more than 2 32-byte cachelines,
 *    meaning there's no need for a loop.
 * "bpp" - number of bits per pixel in the channel (source, mask or
 *    destination) that's being preloaded, or 0 if this channel is not used
 *    for reading
 * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
 * "base" - base address register of channel to preload (SRC, MASK or DST)
 */
 .if bpp > 0
  .if narrow_case && (bpp <= dst_w_bpp)
        /* In these cases, each line for each channel is in either 1 or 2 cache lines */
        PF  bic,    WK0, base, #31
        PF  pld,    [WK0]
        PF  add,    WK1, base, X, LSL #bpp_shift
        PF  sub,    WK1, WK1, #1
        PF  bic,    WK1, WK1, #31
        PF  cmp,    WK1, WK0
        PF  beq,    90f
        PF  pld,    [WK1]
90:
  .else
        PF  bic,    WK0, base, #31
        PF  pld,    [WK0]
        PF  add,    WK1, base, X, lsl #bpp_shift
        PF  sub,    WK1, WK1, #1
        PF  bic,    WK1, WK1, #31
        PF  cmp,    WK1, WK0
        PF  beq,    92f
91:     PF  add,    WK0, WK0, #32
        PF  cmp,    WK0, WK1
        PF  pld,    [WK0]
        PF  bne,    91b
92:
  .endif
 .endif
.endm


.macro conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
        process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
 .if decrementx
        sub&cond X, X, #8*numbytes/dst_w_bpp
 .endif
        process_tail  cond, numbytes, firstreg
 .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   cond, numbytes, firstreg, DST
 .endif
.endm

.macro conditional_process1  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
 .if (flags) & FLAG_BRANCH_OVER
  .ifc cond,mi
        bpl     100f
  .endif
  .ifc cond,cs
        bcc     100f
  .endif
  .ifc cond,ne
        beq     100f
  .endif
        conditional_process1_helper  , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
100:
 .else
        conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
 .endif
.endm
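
/* Illustrative contrast (not from the original source): under FLAG_COND_EXEC,
 * "conditional_process1 cs, ..." tags every emitted instruction with the CS
 * condition (ldrcs, strcs and so on), whereas under FLAG_BRANCH_OVER it
 * instead emits "bcc 100f", an unconditional body, and a "100:" label. The
 * branch-over form suits process macros whose bodies are too long, or which
 * corrupt the flags partway through, for per-instruction predication to be
 * safe and efficient. */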

.macro conditional_process2  test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
 .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
        /* Can't interleave reads and writes */
        test
        conditional_process1  cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
  .if (flags) & FLAG_PROCESS_CORRUPTS_PSR
        test
  .endif
        conditional_process1  cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
 .else
        /* Can interleave reads and writes for better scheduling */
        test
        process_head  cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
        process_head  cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
  .if decrementx
        sub&cond1 X, X, #8*numbytes1/dst_w_bpp
        sub&cond2 X, X, #8*numbytes2/dst_w_bpp
  .endif
        process_tail  cond1, numbytes1, firstreg1
        process_tail  cond2, numbytes2, firstreg2
        pixst   cond1, numbytes1, firstreg1, DST
        pixst   cond2, numbytes2, firstreg2, DST
 .endif
.endm


.macro test_bits_1_0_ptr
 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
        movs    SCRATCH, X, lsl #32-1  /* C,N = bits 1,0 of DST */
 .else
        movs    SCRATCH, WK0, lsl #32-1  /* C,N = bits 1,0 of DST */
 .endif
.endm

.macro test_bits_3_2_ptr
 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
        movs    SCRATCH, X, lsl #32-3  /* C,N = bits 3, 2 of DST */
 .else
        movs    SCRATCH, WK0, lsl #32-3  /* C,N = bits 3, 2 of DST */
 .endif
.endm

.macro leading_15bytes  process_head, process_tail
        /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
 .set DECREMENT_X, 1
 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
  .set DECREMENT_X, 0
        sub     X, X, WK0, lsr #dst_bpp_shift
        str     X, [sp, #LINE_SAVED_REG_COUNT*4]
        mov     X, WK0
 .endif
        /* Use unaligned loads in all cases for simplicity */
 .if dst_w_bpp == 8
        conditional_process2  test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
 .elseif dst_w_bpp == 16
        test_bits_1_0_ptr
        conditional_process1  cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X
 .endif
        conditional_process2  test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
        ldr     X, [sp, #LINE_SAVED_REG_COUNT*4]
 .endif
.endm
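
/* Worked example (illustrative, not from the original source): for an 8bpp
 * destination that needs 13 leading bytes (binary 1101), the bit tests above
 * decompose the work as 1 + 4 + 8: bit 0 set processes 1 byte, bit 1 clear
 * skips the 2-byte step, and bits 2 and 3 set process 4 then 8 bytes, after
 * which the destination pointer is 16-byte aligned. */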

.macro test_bits_3_2_pix
        movs    SCRATCH, X, lsl #dst_bpp_shift+32-3
.endm

.macro test_bits_1_0_pix
 .if dst_w_bpp == 8
        movs    SCRATCH, X, lsl #dst_bpp_shift+32-1
 .else
        movs    SCRATCH, X, lsr #1
 .endif
.endm

.macro trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
        conditional_process2  test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
 .if dst_w_bpp == 16
        test_bits_1_0_pix
        conditional_process1  cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
 .elseif dst_w_bpp == 8
        conditional_process2  test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
 .endif
.endm


.macro wide_case_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
110:
 .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
 .rept pix_per_block*dst_w_bpp/128
        process_head  , 16, 0, unaligned_src, unaligned_mask, 1
  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        preload_middle  src_bpp, SRC, 1
  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        preload_middle  mask_bpp, MASK, 1
  .else
        preload_middle  src_bpp, SRC, 0
        preload_middle  mask_bpp, MASK, 0
  .endif
  .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0)
        /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
         * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
         * preloads for, to achieve staggered prefetches for multiple channels, because there are
         * always two STMs per prefetch, so there is always an opposite STM on which to put the
         * preload. Note, no need to BIC the base register here */
        PF  pld,    [DST, #32*prefetch_distance - dst_alignment]
  .endif
        process_tail  , 16, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 16, 0, DST
  .endif
  .set SUBBLOCK, SUBBLOCK+1
 .endr
        subs    X, X, #pix_per_block
        bhs     110b
.endm

.macro wide_case_inner_loop_and_trailing_pixels  process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
        /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
 .if dst_r_bpp > 0
        tst     DST, #16
        bne     111f
        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS
        b       112f
111:
 .endif
        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS
112:
        /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
 .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
        PF  and,    WK0, X, #pix_per_block-1
 .endif
        preload_trailing  src_bpp, src_bpp_shift, SRC
        preload_trailing  mask_bpp, mask_bpp_shift, MASK
 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
        preload_trailing  dst_r_bpp, dst_bpp_shift, DST
 .endif
        add     X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
        /* The remainder of the line is handled identically to the medium case */
        medium_case_inner_loop_and_trailing_pixels  process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
.endm

.macro medium_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
120:
        process_head  , 16, 0, unaligned_src, unaligned_mask, 0
        process_tail  , 16, 0
 .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 16, 0, DST
 .endif
        subs    X, X, #128/dst_w_bpp
        bhs     120b
        /* Trailing pixels */
        tst     X, #128/dst_w_bpp - 1
        beq     exit_label
        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
.endm
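
/* Worked example (illustrative, not from the original source): for a 32bpp
 * destination the loop above handles 128/32 = 4 pixels per iteration. The
 * caller pre-decrements X by one block (see the "simplifies inner loop
 * termination" comment below), so "subs/bhs" exits on the iteration that
 * would overrun. If 13 pixels remain after the leading bytes, X enters the
 * loop at 9 and it runs three times (X = 9, 5, 1), processing 12 pixels;
 * X underflows to -3, whose low bits still give the correct residue:
 * -3 & 3 = 1 trailing pixel for trailing_15bytes to handle. */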

.macro narrow_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
        tst     X, #16*8/dst_w_bpp
        conditional_process1  ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
        /* Trailing pixels */
        /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
.endm

.macro switch_on_alignment  action, process_head, process_tail, process_inner_loop, exit_label
 /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */
 .if mask_bpp == 8 || mask_bpp == 16
        tst     MASK, #3
        bne     141f
 .endif
  .if src_bpp == 8 || src_bpp == 16
        tst     SRC, #3
        bne     140f
  .endif
        action  process_head, process_tail, process_inner_loop, exit_label, 0, 0
  .if src_bpp == 8 || src_bpp == 16
        b       exit_label
140:
        action  process_head, process_tail, process_inner_loop, exit_label, 1, 0
  .endif
 .if mask_bpp == 8 || mask_bpp == 16
        b       exit_label
141:
  .if src_bpp == 8 || src_bpp == 16
        tst     SRC, #3
        bne     142f
  .endif
        action  process_head, process_tail, process_inner_loop, exit_label, 0, 1
  .if src_bpp == 8 || src_bpp == 16
        b       exit_label
142:
        action  process_head, process_tail, process_inner_loop, exit_label, 1, 1
  .endif
 .endif
.endm


.macro end_of_line      restore_x, vars_spilled, loop_label, last_one
 .if SINGLE_SCANLINE
  .ifc "last_one",""
        b       198f
  .endif
 .else
 .if vars_spilled
        /* Sadly, GAS doesn't seem to have an equivalent of the DCI directive? */
        /* This is ldmia sp,{} */
        .word   0xE89D0000 | LINE_SAVED_REGS
 .endif
        subs    Y, Y, #1
 .if vars_spilled
  .if (LINE_SAVED_REGS) & (1<<1)
        str     Y, [sp]
  .endif
 .endif
        add     DST, DST, STRIDE_D
 .if src_bpp > 0
        add     SRC, SRC, STRIDE_S
 .endif
 .if mask_bpp > 0
        add     MASK, MASK, STRIDE_M
 .endif
 .if restore_x
        mov     X, ORIG_W
 .endif
        bhs     loop_label
 .ifc "last_one",""
  .if vars_spilled
        b       197f
  .else
        b       198f
  .endif
 .else
  .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
        b       198f
  .endif
 .endif
 .endif
.endm


.macro generate_composite_function_common fname, \
                                          src_bpp_, \
                                          mask_bpp_, \
                                          dst_w_bpp_, \
                                          flags_, \
                                          prefetch_distance_, \
                                          init, \
                                          newline, \
                                          cleanup, \
                                          process_head, \
                                          process_tail, \
                                          process_inner_loop

    pixman_asm_function fname

/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
 .set src_bpp, src_bpp_
 .set mask_bpp, mask_bpp_
 .set dst_w_bpp, dst_w_bpp_
 .set flags, flags_
 .set prefetch_distance, prefetch_distance_

/*
 * Select prefetch type for this function.
 */
 .if prefetch_distance == 0
  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
 .else
  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD
 .endif

 .if src_bpp == 32
  .set src_bpp_shift, 2
 .elseif src_bpp == 24
  .set src_bpp_shift, 0
 .elseif src_bpp == 16
  .set src_bpp_shift, 1
 .elseif src_bpp == 8
  .set src_bpp_shift, 0
 .elseif src_bpp == 0
  .set src_bpp_shift, -1
 .else
  .error "requested src bpp (src_bpp) is not supported"
 .endif

 .if mask_bpp == 32
  .set mask_bpp_shift, 2
 .elseif mask_bpp == 24
  .set mask_bpp_shift, 0
 .elseif mask_bpp == 8
  .set mask_bpp_shift, 0
 .elseif mask_bpp == 0
  .set mask_bpp_shift, -1
 .else
  .error "requested mask bpp (mask_bpp) is not supported"
 .endif

 .if dst_w_bpp == 32
  .set dst_bpp_shift, 2
 .elseif dst_w_bpp == 24
  .set dst_bpp_shift, 0
 .elseif dst_w_bpp == 16
  .set dst_bpp_shift, 1
 .elseif dst_w_bpp == 8
  .set dst_bpp_shift, 0
 .else
  .error "requested dst bpp (dst_w_bpp) is not supported"
 .endif

 .if (((flags) & FLAG_DST_READWRITE) != 0)
  .set dst_r_bpp, dst_w_bpp
 .else
  .set dst_r_bpp, 0
 .endif

 .set pix_per_block, 16*8/dst_w_bpp
 .if src_bpp != 0
  .if 32*8/src_bpp > pix_per_block
   .set pix_per_block, 32*8/src_bpp
  .endif
 .endif
 .if mask_bpp != 0
  .if 32*8/mask_bpp > pix_per_block
   .set pix_per_block, 32*8/mask_bpp
  .endif
 .endif
 .if dst_r_bpp != 0
  .if 32*8/dst_r_bpp > pix_per_block
   .set pix_per_block, 32*8/dst_r_bpp
  .endif
 .endif

/* The standard entry conditions set up by pixman-arm-common.h are:
 * r0 = width (pixels)
 * r1 = height (rows)
 * r2 = pointer to top-left pixel of destination
 * r3 = destination stride (pixels)
 * [sp] = source pixel value, or pointer to top-left pixel of source
 * [sp,#4] = 0 or source stride (pixels)
 * The following arguments are unused for non-mask operations
 * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask
 * [sp,#12] = 0 or mask stride (pixels)
 *
 * or in the single-scanline case:
 * r0 = width (pixels)
 * r1 = pointer to top-left pixel of destination
 * r2 = pointer to top-left pixel of source
 * The following argument is unused for non-mask operations
 * r3 = pointer to top-left pixel of mask
 */

/*
 * Assign symbolic names to registers
 */
    X           .req    r0  /* pixels to go on this line */
 .if SINGLE_SCANLINE
    DST         .req    r1  /* destination pixel pointer */
    SRC         .req    r2  /* source pixel pointer */
    MASK        .req    r3  /* mask pixel pointer (if applicable) */
    Y           .req    r4  /* temporary */
    STRIDE_D    .req    r5  /* temporary */
    STRIDE_S    .req    r6  /* temporary */
    STRIDE_M    .req    r7  /* temporary */
 .else
    Y           .req    r1  /* lines to go */
    DST         .req    r2  /* destination pixel pointer */
    STRIDE_D    .req    r3  /* destination stride (bytes, minus width) */
    SRC         .req    r4  /* source pixel pointer */
    STRIDE_S    .req    r5  /* source stride (bytes, minus width) */
    MASK        .req    r6  /* mask pixel pointer (if applicable) */
    STRIDE_M    .req    r7  /* mask stride (bytes, minus width) */
 .endif
    WK0         .req    r8  /* pixel data registers */
    WK1         .req    r9
    WK2         .req    r10
    WK3         .req    r11
    SCRATCH     .req    r12
    ORIG_W      .req    r14 /* width (pixels) */

        push    {r4-r11, lr}        /* save all registers */

 .if !SINGLE_SCANLINE
        subs    Y, Y, #1
        blo     199f
 .endif

#ifdef DEBUG_PARAMS
        sub     sp, sp, #9*4
#endif

 .if !SINGLE_SCANLINE
 .if src_bpp > 0
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        ldr     STRIDE_S, [sp, #ARGS_STACK_OFFSET+4]
 .endif
 .if mask_bpp > 0
        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
        ldr     STRIDE_M, [sp, #ARGS_STACK_OFFSET+12]
 .endif
 .endif

#ifdef DEBUG_PARAMS
        add     Y, Y, #1
        stmia   sp, {r0-r7,pc}
        sub     Y, Y, #1
#endif

        init

 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
        /* Reserve a word in which to store X during leading pixels */
        sub     sp, sp, #4
  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+4
  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4
 .endif

 .if !SINGLE_SCANLINE
        lsl     STRIDE_D, #dst_bpp_shift /* stride in bytes */
        sub     STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
 .if src_bpp > 0
        lsl     STRIDE_S, #src_bpp_shift
        sub     STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift
 .endif
 .if mask_bpp > 0
        lsl     STRIDE_M, #mask_bpp_shift
        sub     STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift
 .endif
 .endif

        /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */
        cmp     X, #2*16*8/dst_w_bpp - 1
        blo     170f
 .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */
        /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */
        cmp     X, #(prefetch_distance+3)*pix_per_block - 1
        blo     160f

        /* Wide case */
        /* Adjust X so that the decrement instruction can also test for
         * inner loop termination. We want it to stop when there are
         * (prefetch_distance+1) complete blocks to go. */
        sub     X, X, #(prefetch_distance+2)*pix_per_block
  .if !SINGLE_SCANLINE
        mov     ORIG_W, X
  .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
   .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
   .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
  .endif
  .endif
151:    /* New line */
        newline
        preload_leading_step1  src_bpp, WK1, SRC
        preload_leading_step1  mask_bpp, WK2, MASK
  .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
        preload_leading_step1  dst_r_bpp, WK3, DST
  .endif

        ands    WK0, DST, #15
        beq     154f
        rsb     WK0, WK0, #16 /* number of leading bytes until destination aligned */

        preload_leading_step2  src_bpp, src_bpp_shift, WK1, SRC
        preload_leading_step2  mask_bpp, mask_bpp_shift, WK2, MASK
  .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
        preload_leading_step2  dst_r_bpp, dst_bpp_shift, WK3, DST
  .endif

        leading_15bytes  process_head, process_tail

154:    /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        and     SCRATCH, SRC, #31
        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        and     SCRATCH, MASK, #31
        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
  .endif
  .ifc "process_inner_loop",""
        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
  .else
        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
  .endif

157:    /* Check for another line */
        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
  .if (!SINGLE_SCANLINE) && ((flags) & FLAG_SPILL_LINE_VARS_WIDE)
   .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
   .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
  .endif
 .endif

 .ltorg

160:    /* Medium case */
 .if !SINGLE_SCANLINE
        mov     ORIG_W, X
 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
 .endif
 .endif
161:    /* New line */
        newline
        preload_line 0, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
        preload_line 0, mask_bpp, mask_bpp_shift, MASK
 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
        preload_line 0, dst_r_bpp, dst_bpp_shift, DST
 .endif

        sub     X, X, #128/dst_w_bpp     /* simplifies inner loop termination */
        ands    WK0, DST, #15
        beq     164f
        rsb     WK0, WK0, #16 /* number of leading bytes until destination aligned */

        leading_15bytes  process_head, process_tail

164:    /* Destination now 16-byte aligned; we have at least one 16-byte output block */
        switch_on_alignment  medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f

167:    /* Check for another line */
        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b

 .ltorg

170:    /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */
 .if !SINGLE_SCANLINE
 .if dst_w_bpp < 32
        mov     ORIG_W, X
 .endif
 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
 .endif
 .endif
171:    /* New line */
        newline
        preload_line 1, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
        preload_line 1, mask_bpp, mask_bpp_shift, MASK
 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
        preload_line 1, dst_r_bpp, dst_bpp_shift, DST
 .endif

 .if dst_w_bpp == 8
        tst     DST, #3
        beq     174f
172:    subs    X, X, #1
        blo     177f
        process_head  , 1, 0, 1, 1, 0
        process_tail  , 1, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 1, 0, DST
  .endif
        tst     DST, #3
        bne     172b
 .elseif dst_w_bpp == 16
        tst     DST, #2
        beq     174f
        subs    X, X, #1
        blo     177f
        process_head  , 2, 0, 1, 1, 0
        process_tail  , 2, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 2, 0, DST
  .endif
 .endif

174:    /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
        switch_on_alignment  narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f

177:    /* Check for another line */
        end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
 .if (!SINGLE_SCANLINE) && ((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE)
  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
 .endif

197:
 .if (!SINGLE_SCANLINE) && ((flags) & FLAG_SPILL_LINE_VARS)
        add     sp, sp, #LINE_SAVED_REG_COUNT*4
 .endif
198:
 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-4
  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET-4
        add     sp, sp, #4
 .endif

        cleanup

#ifdef DEBUG_PARAMS
        add     sp, sp, #9*4 /* junk the debug copy of arguments */
#endif
199:
        pop     {r4-r11, pc}  /* exit */

 .ltorg

    .unreq  X
    .unreq  Y
    .unreq  DST
    .unreq  STRIDE_D
    .unreq  SRC
    .unreq  STRIDE_S
    .unreq  MASK
    .unreq  STRIDE_M
    .unreq  WK0
    .unreq  WK1
    .unreq  WK2
    .unreq  WK3
    .unreq  SCRATCH
    .unreq  ORIG_W
    .endfunc
.endm

.macro generate_composite_function fname, \
                                   src_bpp_, \
                                   mask_bpp_, \
                                   dst_w_bpp_, \
                                   flags_, \
                                   prefetch_distance_, \
                                   init, \
                                   newline, \
                                   cleanup, \
                                   process_head, \
                                   process_tail, \
                                   process_inner_loop
 .set SINGLE_SCANLINE, 0
generate_composite_function_common \
    fname, src_bpp_, mask_bpp_, dst_w_bpp_, flags_, prefetch_distance_, \
    init, newline, cleanup, process_head, process_tail, process_inner_loop
.endm

.macro generate_composite_function_single_scanline fname, \
                                                   src_bpp_, \
                                                   mask_bpp_, \
                                                   dst_w_bpp_, \
                                                   flags_, \
                                                   prefetch_distance_, \
                                                   init, \
                                                   newline, \
                                                   cleanup, \
                                                   process_head, \
                                                   process_tail, \
                                                   process_inner_loop
 .set SINGLE_SCANLINE, 1
generate_composite_function_common \
    fname, src_bpp_, mask_bpp_, dst_w_bpp_, flags_, prefetch_distance_, \
    init, newline, cleanup, process_head, process_tail, process_inner_loop
.endm

.macro line_saved_regs  x:vararg
 .set LINE_SAVED_REGS, 0
 .set LINE_SAVED_REG_COUNT, 0
 .irp SAVED_REG,x
  .ifc "SAVED_REG","Y"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_D"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_S"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_M"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","ORIG_W"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
 .endr
 .if SINGLE_SCANLINE
  .set LINE_SAVED_REG_COUNT, 0
 .endif
.endm
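
/* Illustrative example (not from the original source): "line_saved_regs
 * STRIDE_D, ORIG_W" sets LINE_SAVED_REGS to (1<<3)|(1<<14) = 0x4008 and
 * LINE_SAVED_REG_COUNT to 2, so the hand-encoded spill in the line setup
 * code, ".word 0xE92D0000 | LINE_SAVED_REGS", assembles to 0xE92D4008,
 * which is exactly "stmdb sp!, {r3, lr}". The bit positions in
 * LINE_SAVED_REGS mirror the register numbers chosen by the .req aliases
 * above. */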

.macro nop_macro x:vararg
.endm
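
/* Hypothetical usage sketch (not part of this header): pixman-arm-simd-asm.S
 * builds its ARMv6 fast paths by defining a pair of process macros and then
 * instantiating this framework. The macro and function names below are
 * invented for illustration; only the framework symbols (pixld, pixst,
 * nop_macro, the FLAG_* values and generate_composite_function) come from
 * this file, and pixman_asm_function is assumed to be provided by
 * pixman-arm-asm.h as in upstream pixman. */

.macro example_blit_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        /* A plain copy: just fetch source pixels; the framework stores them */
        pixld   cond, numbytes, firstreg, SRC, unaligned_src
.endm

.macro example_blit_process_tail  cond, numbytes, firstreg
        /* Nothing to compute for a straight copy */
.endm

generate_composite_function \
    example_composite_src_8888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
    4, /* prefetch distance */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    example_blit_process_head, \
    example_blit_process_tail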