pixman-arm-simd-asm.S (19392B)
/*
 * Copyright (c) 2016 RISC OS Open Ltd
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

        .text
        .arch armv6
        .object_arch armv4
        .arm
        .altmacro
        .p2align 2

#include "pixman-arm-asm.h"
#include "pixman-arm-simd-asm.h"

/* A head macro should do all processing which results in an output of up to
 * 16 bytes, as far as the final load instruction. The corresponding tail macro
 * should complete the processing of the up-to-16 bytes. The calling macro will
 * sometimes choose to insert a preload or a decrement of X between them.
 *   cond           ARM condition code for code block
 *   numbytes       Number of output bytes that should be generated this time
 *   firstreg       First WK register in which to place output
 *   unaligned_src  Whether to use non-wordaligned loads of source image
 *   unaligned_mask Whether to use non-wordaligned loads of mask image
 *   preload        If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
 *
 * All register names used below (SRC, DST, MASK, STRIDE_*, ORIG_W, SCRATCH,
 * WKn) and the helper macros generate_composite_function, pixld, pixst,
 * line_saved_regs and nop_macro are not defined in this file; they are
 * expected to come from the headers included above.
 */

/******************************************************************************/

/* Solid fill, 32bpp.
 * Loads the 32-bit fill value from the stack slot at ARGS_STACK_OFFSET and
 * copies it into STRIDE_S, MASK and STRIDE_M, so that four registers
 * (SRC, STRIDE_S, MASK, STRIDE_M) all hold the same fill word.
 */
.macro FillRect32_init
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm

/* Solid fill, 16bpp.
 * Loads a halfword fill value, duplicates it into both halves of SRC, then
 * copies the resulting word into STRIDE_S, MASK and STRIDE_M.
 */
.macro FillRect16_init
        ldrh    SRC, [sp, #ARGS_STACK_OFFSET]
        orr     SRC, SRC, lsl #16               /* SRC = value | value << 16 */
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm

/* Solid fill, 8bpp.
 * Loads a byte fill value, replicates it into all four bytes of SRC, then
 * copies the resulting word into STRIDE_S, MASK and STRIDE_M.
 */
.macro FillRect8_init
        ldrb    SRC, [sp, #ARGS_STACK_OFFSET]
        orr     SRC, SRC, lsl #8                /* byte -> halfword */
        orr     SRC, SRC, lsl #16               /* halfword -> word */
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm

/* Shared tail for all three fill widths.
 * Temporarily aliases WK4-WK7 to the four registers set up by the init macro
 * and passes register index 4 to pixst, which performs the store through DST.
 * The aliases are removed again so that they do not leak into other code.
 * The firstreg argument is not used: the fill data always lives in WK4-WK7.
 */
.macro FillRect_process_tail  cond, numbytes, firstreg
    WK4     .req    SRC
    WK5     .req    STRIDE_S
    WK6     .req    MASK
    WK7     .req    STRIDE_M
        pixst   cond, numbytes, 4, DST
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

/* The three numeric arguments after the function name are presumably the
 * source, mask and destination bits-per-pixel (0 = image not used) - confirm
 * against generate_composite_function in pixman-arm-simd-asm.h.
 */
generate_composite_function \
    FillRect32ARMSIMDAsm, 0, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    0, /* prefetch distance doesn't apply */ \
    FillRect32_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    nop_macro, /* process head */ \
    FillRect_process_tail

generate_composite_function \
    FillRect16ARMSIMDAsm, 0, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    0, /* prefetch distance doesn't apply */ \
    FillRect16_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    nop_macro, /* process head */ \
    FillRect_process_tail

generate_composite_function \
    FillRect8ARMSIMDAsm, 0, 0, 8, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    0, /* prefetch distance doesn't apply */ \
    FillRect8_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    nop_macro, /* process head */ \
    FillRect_process_tail

/******************************************************************************/

/* This differs from the over_8888_8888 routine in Pixman in that the destination
 * alpha component is always left unchanged, and RGB components are not
 * premultiplied by alpha. It differs from BlitRGBtoRGBPixelAlpha in that
 * renormalisation is done by multiplying by 257/256 (with rounding) rather than
 * simply shifting right by 8 bits - removing the need to special-case alpha=0xff.
 */

/* MASK holds the rounding constant 0x80 that the translucent path adds to
 * every channel product (it is passed as the "half" argument below).
 */
.macro RGBtoRGBPixelAlpha_init
        line_saved_regs STRIDE_S, ORIG_W
        mov     MASK, #0x80
.endm

/* Blend one 32bpp source pixel into one 32bpp destination pixel.
 * On entry:
 *   s           holds the source pixel (alpha in bits 24-31)
 *   d           holds the destination pixel
 *   half        holds the rounding constant added to each product
 *   tmp0-tmp3   are temporaries
 * On exit:
 *   d           holds the blended pixel; its top byte is unchanged because
 *               the value added by uadd8 has a zero top byte
 *   s           is corrupted (it is overwritten with the source alpha)
 * For each of the three low byte channels the macro computes
 *   t = (s_chan - d_chan) * alpha + half;  t += t >> 8;
 * and adds bits 8-15 of t to the destination channel, modulo 256.
 */
.macro RGBtoRGBPixelAlpha_1pixel_translucent  s, d, tmp0, tmp1, tmp2, tmp3, half
        uxtb    tmp3, s
        uxtb    tmp0, d
        sub     tmp0, tmp3, tmp0                /* channel in bits 0-7: s - d */
        uxtb    tmp3, s, ror #16
        uxtb    tmp1, d, ror #16
        sub     tmp1, tmp3, tmp1                /* channel in bits 16-23: s - d */
        uxtb    tmp3, s, ror #8
        mov     s, s, lsr #24                   /* s = source alpha */
        uxtb    tmp2, d, ror #8
        sub     tmp2, tmp3, tmp2                /* channel in bits 8-15: s - d */
        smlabb  tmp0, tmp0, s, half
        smlabb  tmp1, tmp1, s, half
        smlabb  tmp2, tmp2, s, half
        add     tmp0, tmp0, asr #8              /* t += t >> 8 (the 257/256 renormalisation) */
        add     tmp1, tmp1, asr #8
        add     tmp2, tmp2, asr #8
        pkhbt   tmp0, tmp0, tmp1, lsl #16       /* pack two 16-bit results into one word */
        and     tmp2, tmp2, #0xff00
        uxtb16  tmp0, tmp0, ror #8              /* keep bits 8-15 of each packed result */
        orr     tmp0, tmp0, tmp2
        uadd8   d, d, tmp0                      /* per-byte add; top byte of tmp0 is 0 */
.endm

/* Fully opaque source pixel: keep the top byte of d and replace its low
 * 24 bits with those of s. s is corrupted (its top byte is cleared).
 */
.macro RGBtoRGBPixelAlpha_1pixel_opaque  s, d
        and     d, d, #0xff000000
        bic     s, s, #0xff000000
        orr     d, d, s
.endm

/* Load source and destination pixels and set the Z flag if every source
 * pixel in the group has a zero alpha byte.
 *   16 bytes: sources in WK0, WK1, STRIDE_S, STRIDE_M; only the first two
 *             destination pixels are loaded (WK2, WK3) although DST advances
 *             by 16.
 *   8 bytes:  sources in WK0, WK1; destinations in WK2, WK3.
 *   4 bytes:  source in WK0; destination in WK2.
 * For the 8 and 16 byte cases ORIG_W receives the AND of all source pixels,
 * which the tail uses to detect the all-opaque case.
 */
.macro RGBtoRGBPixelAlpha_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 .if numbytes == 16
        ldm     SRC!, {WK0, WK1}
        ldm     SRC!, {STRIDE_S, STRIDE_M}
        ldrd    WK2, WK3, [DST], #16
        orr     SCRATCH, WK0, WK1
        and     ORIG_W, WK0, WK1
        orr     SCRATCH, SCRATCH, STRIDE_S
        and     ORIG_W, ORIG_W, STRIDE_S
        orr     SCRATCH, SCRATCH, STRIDE_M
        and     ORIG_W, ORIG_W, STRIDE_M
        tst     SCRATCH, #0xff000000
 .elseif numbytes == 8
        ldm     SRC!, {WK0, WK1}
        ldm     DST!, {WK2, WK3}
        orr     SCRATCH, WK0, WK1
        and     ORIG_W, WK0, WK1
        tst     SCRATCH, #0xff000000
 .else // numbytes == 4
        ldr     WK0, [SRC], #4
        ldr     WK2, [DST], #4
        tst     WK0, #0xff000000
 .endif
.endm

/* Complete the blend started by the head macro and store the result.
 * Relies on the flags left by the head: if Z is set, all source pixels are
 * transparent and nothing is stored. Otherwise the group is handled by the
 * opaque path when the AND of the source pixels is >= 0xff000000 (every alpha
 * byte is 0xff), or by the translucent path in all other cases.
 * In the 16-byte case the second pair of source and destination pixels is
 * reloaded from memory, because STRIDE_S and STRIDE_M (which held source
 * pixels 3 and 4) are used as temporaries by the translucent path.
 */
.macro RGBtoRGBPixelAlpha_process_tail  cond, numbytes, firstreg
        beq     20f @ all transparent
 .if numbytes == 16
        cmp     ORIG_W, #0xff000000
        bhs     10f @ all opaque
        RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
        RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
        strd    WK2, WK3, [DST, #-16]
        ldrd    WK0, WK1, [SRC, #-8]
        ldrd    WK2, WK3, [DST, #-8]
        RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
        RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
        b       19f
10:     RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
        RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3
        strd    WK2, WK3, [DST, #-16]
        ldrd    WK0, WK1, [SRC, #-8]
        ldrd    WK2, WK3, [DST, #-8]
        RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
        RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3
19:     strd    WK2, WK3, [DST, #-8]
 .elseif numbytes == 8
        cmp     ORIG_W, #0xff000000
        bhs     10f @ all opaque
        RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
        RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
        b       19f
10:     RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
        RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3
19:     strd    WK2, WK3, [DST, #-8]
 .else // numbytes == 4
        cmp     WK0, #0xff000000
        bhs     10f @ opaque
        RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
        b       19f
10:     RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
19:     str     WK2, [DST, #-4]
 .endif
20:
.endm

generate_composite_function \
    BlitRGBtoRGBPixelAlphaARMSIMDAsm, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
    2, /* prefetch distance */ \
    RGBtoRGBPixelAlpha_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    RGBtoRGBPixelAlpha_process_head, \
    RGBtoRGBPixelAlpha_process_tail

/******************************************************************************/

/* Build the constants used by the 565 blend:
 *   MASK     = 0x001f001f  (5-bit field mask in each halfword)
 *   STRIDE_M = 0x00100010  (rounding constant for the two 5-bit channels)
 */
.macro ARGBto565PixelAlpha_init
        line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
        mov     MASK, #0x001f
        mov     STRIDE_M, #0x0010
        orr     MASK, MASK, MASK, lsl #16
        orr     STRIDE_M, STRIDE_M, STRIDE_M, lsl #16
.endm

/* Per-line setup: STRIDE_S = 0x0200, the rounding constant for the 6-bit
 * green channel (passed as "ghalf" below).
 */
.macro ARGBto565PixelAlpha_newline
        mov     STRIDE_S, #0x0200
.endm

/* On entry:
 * s holds 1 32bpp source pixel
 * d holds 1 16bpp destination pixel
 * rbmask, rbhalf, ghalf hold 0x001f001f, 0x00100010, 0x00000200 respectively
 * other registers are temporaries
 * On exit:
 * Constant registers preserved
 * The low 16 bits of d hold the blended 565 pixel; the caller stores them
 * with strh. s is corrupted.
 * The source alpha is reduced to its top 5 bits (s >> 27); each product is
 * renormalised by multiplying by 33 and shifting right by 10.
 */

.macro ARGBto565PixelAlpha_1pixel_translucent  s, d, rbmask, rbhalf, ghalf, alpha, rb, g, misc
        mov     alpha, s, lsr #27
        and     misc, s, #0xfc00
        and     g, d, #0x07e0
        pkhbt   rb, d, d, lsl #5
        rsb     misc, g, misc, lsr #5
        and     s, rbmask, s, lsr #3
        and     rb, rbmask, rb
        sub     s, s, rb
        smlabb  misc, misc, alpha, ghalf
        mla     s, s, alpha, rbhalf
        add     misc, misc, misc, lsl #5
        add     g, g, misc, asr #10
        add     s, s, s, lsl #5
        and     g, g, #0x07e0
        add     rb, rb, s, asr #10
        and     rb, rb, rbmask
        pkhbt   rb, rb, rb, lsl #11
        orr     d, rb, g
        orr     d, d, rb, lsr #16
.endm

/* On entry:
 * s holds 1 32bpp source pixel
 * d holds 1 16bpp destination pixel
 * rbmask holds 0x001f001f
 * On exit:
 * Constant registers preserved
 * The low 16 bits of d hold the source pixel converted to 565; the previous
 * value of d is discarded. s is corrupted.
 */

.macro ARGBto565PixelAlpha_1pixel_opaque  s, d, rbmask
        and     d, rbmask, s, lsr #3
        and     s, s, #0xfc00
        orr     d, d, d, lsr #5
        orr     d, d, s, lsr #5
.endm

/* On entry:
 * s1, s2 hold 2 32bpp source pixels
 * d holds 2 16bpp destination pixels
 * rbmask, rbhalf, ghalf hold 0x001f001f, 0x00100010, 0x00000200 respectively
 * other registers are temporaries
 * On exit:
 * Constant registers preserved
 * Blended results have been written through destination pointer
 *
 * The two pixels are processed with the same arithmetic as the 1-pixel
 * translucent macro, with the two instruction streams interleaved.
 */

.macro ARGBto565PixelAlpha_2pixels_translucent  s1, s2, d, rbmask, rbhalf, ghalf, alpha, rb, g, misc
        mov     alpha, s1, lsr #27
        and     misc, s1, #0xfc00
        and     g, d, #0x07e0
        pkhbt   rb, d, d, lsl #5
        rsb     misc, g, misc, lsr #5
        and     s1, rbmask, s1, lsr #3
        and     rb, rbmask, rb
        sub     s1, s1, rb
        smlabb  misc, misc, alpha, ghalf
        mla     s1, s1, alpha, rbhalf
        uxth    d, d, ror #16                   /* d = second destination pixel */
        add     misc, misc, misc, lsl #5
        mov     alpha, s2, lsr #27
        add     g, g, misc, asr #10
        add     s1, s1, s1, lsl #5
        and     g, g, #0x07e0
        add     rb, rb, s1, asr #10
        and     rb, rb, rbmask
        and     misc, s2, #0xfc00
        pkhbt   rb, rb, rb, lsl #11
        and     s1, d, #0x07e0                  /* s1 is reused for the second pixel's green */
        pkhbt   d, d, d, lsl #5
        rsb     misc, s1, misc, lsr #5
        and     s2, rbmask, s2, lsr #3
        and     d, rbmask, d
        sub     s2, s2, d
        smlabb  misc, misc, alpha, ghalf
        mla     s2, s2, alpha, rbhalf
        orr     alpha, rb, g                    /* alpha is reused to assemble the first result */
        add     misc, misc, misc, lsl #5
        orr     alpha, alpha, rb, lsr #16
        add     s1, s1, misc, asr #10
        add     s2, s2, s2, lsl #5
        and     s1, s1, #0x07e0
        add     d, d, s2, asr #10
        and     d, d, rbmask
        strh    alpha, [DST, #-4]
        pkhbt   d, d, d, lsl #11
        orr     alpha, d, s1
        orr     alpha, alpha, d, lsr #16
        strh    alpha, [DST, #-2]
.endm

/* On entry:
 * s1, s2 hold 2 32bpp source pixels
 * rbmask holds 0x001f001f
 * other registers are temporaries
 * On exit:
 * Constant registers preserved
 * Blended results have been written through destination pointer
 */

.macro ARGBto565PixelAlpha_2pixels_opaque  s1, s2, d, rbmask, g
        and     g, s1, #0xfc00
        and     d, rbmask, s1, lsr #3
        and     s1, rbmask, s2, lsr #3
        orr     d, d, d, lsr #5
        orr     d, d, g, lsr #5
        and     g, s2, #0xfc00
        strh    d, [DST, #-4]
        orr     s1, s1, s1, lsr #5
        orr     s1, s1, g, lsr #5
        strh    s1, [DST, #-2]
.endm

/* Load two source pixels (WK0, WK1) and two destination pixels (WK2).
 * SCRATCH = OR and ORIG_W = AND of the source pixels; Z is set when both
 * source alpha bytes are zero.
 */
.macro ARGBto565PixelAlpha_2pixels_head
        ldrd    WK0, WK1, [SRC], #8
        ldr     WK2, [DST], #4
        orr     SCRATCH, WK0, WK1
        and     ORIG_W, WK0, WK1
        tst     SCRATCH, #0xff000000
.endm

/* Dispatch on the flags and ORIG_W left by ARGBto565PixelAlpha_2pixels_head:
 * skip the store entirely, convert both pixels as opaque, or blend both.
 */
.macro ARGBto565PixelAlpha_2pixels_tail
        beq     20f @ all transparent
        cmp     ORIG_W, #0xff000000
        bhs     10f @ all opaque
        ARGBto565PixelAlpha_2pixels_translucent WK0, WK1, WK2, MASK, STRIDE_M, STRIDE_S, STRIDE_D, WK3, SCRATCH, ORIG_W
        b       20f
10:     ARGBto565PixelAlpha_2pixels_opaque WK0, WK1, WK2, MASK, SCRATCH
20:
.endm

/* numbytes counts output (16bpp) bytes, so 16/8/4/2 bytes are 8/4/2/1
 * pixels. All pixel pairs except the last are fully processed here (head and
 * tail); only the final pair, or the single pixel, is left for the tail macro.
 */
.macro ARGBto565PixelAlpha_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 .if numbytes == 16
        ARGBto565PixelAlpha_2pixels_head
        ARGBto565PixelAlpha_2pixels_tail
        ARGBto565PixelAlpha_2pixels_head
        ARGBto565PixelAlpha_2pixels_tail
 .endif
 .if numbytes >= 8
        ARGBto565PixelAlpha_2pixels_head
        ARGBto565PixelAlpha_2pixels_tail
 .endif
 .if numbytes >= 4
        ARGBto565PixelAlpha_2pixels_head
 .else // numbytes == 2
        ldr     WK0, [SRC], #4
        ldrh    WK2, [DST], #2
        tst     WK0, #0xff000000
 .endif
.endm

/* Finish the last pixel pair (or single pixel) loaded by the head macro. */
.macro ARGBto565PixelAlpha_process_tail  cond, numbytes, firstreg
 .if numbytes >= 4
        ARGBto565PixelAlpha_2pixels_tail
 .else // numbytes == 2
        beq     20f @ all transparent
        cmp     WK0, #0xff000000
        bhs     10f @ opaque
        ARGBto565PixelAlpha_1pixel_translucent WK0, WK2, MASK, STRIDE_M, STRIDE_S, STRIDE_D, WK3, SCRATCH, ORIG_W
        b       19f
10:     ARGBto565PixelAlpha_1pixel_opaque WK0, WK2, MASK
19:     strh    WK2, [DST, #-2]
20:
 .endif
.endm

generate_composite_function \
    BlitARGBto565PixelAlphaARMSIMDAsm, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
    2, /* prefetch distance */ \
    ARGBto565PixelAlpha_init, \
    ARGBto565PixelAlpha_newline, \
    nop_macro, /* cleanup */ \
    ARGBto565PixelAlpha_process_head, \
    ARGBto565PixelAlpha_process_tail

/******************************************************************************/

/* Swap bytes 0 and 2 of register WK<reg>, leaving bytes 1 and 3 in place.
 *   cond   condition code appended to every instruction
 *   reg    index of the WK register to convert in place
 *   tmp    temporary register (corrupted)
 */
.macro BGR888toRGB888_1pixel cond, reg, tmp
        uxtb16&cond  tmp, WK&reg, ror #8
        uxtb16&cond  WK&reg, WK&reg, ror #16
        orr&cond     WK&reg, WK&reg, tmp, lsl #8
.endm

/* Same conversion as BGR888toRGB888_1pixel applied to WK<reg1> and WK<reg2>,
 * with the two instruction streams interleaved. tmp1 and tmp2 are corrupted.
 */
.macro BGR888toRGB888_2pixels cond, reg1, reg2, tmp1, tmp2
        uxtb16&cond  tmp1, WK&reg1, ror #8
        uxtb16&cond  WK&reg1, WK&reg1, ror #16
        uxtb16&cond  tmp2, WK&reg2, ror #8
        uxtb16&cond  WK&reg2, WK&reg2, ror #16
        orr&cond     WK&reg1, WK&reg1, tmp1, lsl #8
        orr&cond     WK&reg2, WK&reg2, tmp2, lsl #8
.endm

/* Load numbytes of source pixels via pixld, starting at WK<firstreg>. */
.macro BGR888toRGB888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   cond, numbytes, firstreg, SRC, unaligned_src
.endm

/* Convert the loaded pixels in place, using MASK and STRIDE_M as
 * temporaries. No store is issued here.
 */
.macro BGR888toRGB888_process_tail  cond, numbytes, firstreg
 .if numbytes >= 8
        BGR888toRGB888_2pixels cond, %(firstreg+0), %(firstreg+1), MASK, STRIDE_M
  .if numbytes == 16
        BGR888toRGB888_2pixels cond, %(firstreg+2), %(firstreg+3), MASK, STRIDE_M
  .endif
 .else @ numbytes == 4
        BGR888toRGB888_1pixel cond, %(firstreg+0), MASK
 .endif
.endm

generate_composite_function \
    Blit_BGR888_RGB888ARMSIMDAsm, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
    2, /* prefetch distance */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    BGR888toRGB888_process_head, \
    BGR888toRGB888_process_tail

/******************************************************************************/

/* One-time setup for the 4444 -> 8888 expansion:
 *   MASK = 0x0f0f0f0f, used to isolate one nibble per byte.
 *   GE[3:0] = 0101, so that each SEL below takes bytes 0 and 2 from its first
 *   source operand and bytes 1 and 3 from its second.
 */
.macro RGB444toRGB888_init
        ldr     MASK, =0x0f0f0f0f
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        msr     CPSR_s, #0x50000
.endm

/* Expand the 16bpp pixel in the low halfword of WK<reg> to 32bpp in place.
 *   reg    index of the WK register holding the pixel
 *   mask   register holding 0x0f0f0f0f
 *   tmp    temporary register (corrupted)
 * Each 4-bit field is widened to 8 bits by duplicating the nibble. The
 * trailing comments show the bit layout of the destination register after
 * each instruction. Requires the GE flags set by RGB444toRGB888_init.
 */
.macro RGB444toRGB888_1pixel reg, mask, tmp
        pkhbt   WK&reg, WK&reg, WK&reg, lsl #12      @ 0000aaaarrrrggggaaaarrrrggggbbbb
        and     WK&reg, mask, WK&reg                 @ 0000aaaa0000gggg0000rrrr0000bbbb
        orr     WK&reg, WK&reg, WK&reg, lsl #4       @ aaaaaaaaggggggggrrrrrrrrbbbbbbbb
        pkhtb   tmp, WK&reg, WK&reg, asr #8          @ aaaaaaaaggggggggggggggggrrrrrrrr
        pkhbt   WK&reg, WK&reg, WK&reg, lsl #8       @ ggggggggrrrrrrrrrrrrrrrrbbbbbbbb
        sel     WK&reg, WK&reg, tmp                  @ aaaaaaaarrrrrrrrggggggggbbbbbbbb
.endm

/* Expand the two 16bpp pixels packed in WK<in> to two 32bpp pixels.
 *   in          index of the WK register holding both source pixels
 *               (lower-case letters in the comments are the pixel in the low
 *               halfword, upper-case the pixel in the high halfword)
 *   out1, out2  indices of the WK registers receiving the low and high pixel
 *   mask        register holding 0x0f0f0f0f
 *   tmp1, tmp2  temporary registers (corrupted)
 * WK<in> is read only by the first two instructions, so "in" may be the same
 * register as "out1" or "out2". Requires the GE flags set by
 * RGB444toRGB888_init.
 */
.macro RGB444toRGB888_2pixels in, out1, out2, mask, tmp1, tmp2
        and     tmp1, mask, WK&in                    @ 0000RRRR0000BBBB0000rrrr0000bbbb
        and     tmp2, mask, WK&in, lsr #4            @ 0000AAAA0000GGGG0000aaaa0000gggg
        orr     tmp1, tmp1, tmp1, lsl #4             @ RRRRRRRRBBBBBBBBrrrrrrrrbbbbbbbb
        orr     tmp2, tmp2, tmp2, lsl #4             @ AAAAAAAAGGGGGGGGaaaaaaaagggggggg
        pkhtb   WK&out2, tmp2, tmp1, asr #16         @ AAAAAAAAGGGGGGGGRRRRRRRRBBBBBBBB
        pkhbt   WK&out1, tmp1, tmp2, lsl #16         @ aaaaaaaaggggggggrrrrrrrrbbbbbbbb
        pkhtb   tmp2, WK&out2, WK&out2, asr #8       @ AAAAAAAAGGGGGGGGGGGGGGGGRRRRRRRR
        pkhtb   tmp1, WK&out1, WK&out1, asr #8       @ aaaaaaaaggggggggggggggggrrrrrrrr
        pkhbt   WK&out1, WK&out1, WK&out1, lsl #8    @ ggggggggrrrrrrrrrrrrrrrrbbbbbbbb
        pkhbt   WK&out2, WK&out2, WK&out2, lsl #8    @ GGGGGGGGRRRRRRRRRRRRRRRRBBBBBBBB
        sel     WK&out1, WK&out1, tmp1               @ aaaaaaaarrrrrrrrggggggggbbbbbbbb
        sel     WK&out2, WK&out2, tmp2               @ AAAAAAAARRRRRRRRGGGGGGGGBBBBBBBB
.endm

/* Load the source pixels via pixld. numbytes counts output (32bpp) bytes, so
 * only half as many source bytes are requested.
 */
.macro RGB444toRGB888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   cond, numbytes/2, firstreg, SRC, unaligned_src
.endm

/* Expand the loaded pixels in place, using STRIDE_M and SCRATCH as
 * temporaries. In the 16-byte case WK<firstreg+1> is expanded first, into
 * WK<firstreg+2> and WK<firstreg+3>, so that it has been consumed before the
 * expansion of WK<firstreg+0> overwrites it.
 */
.macro RGB444toRGB888_process_tail  cond, numbytes, firstreg
 .if numbytes >= 8
  .if numbytes == 16
        RGB444toRGB888_2pixels %(firstreg+1), %(firstreg+2), %(firstreg+3), MASK, STRIDE_M, SCRATCH
  .endif
        RGB444toRGB888_2pixels %(firstreg+0), %(firstreg+0), %(firstreg+1), MASK, STRIDE_M, SCRATCH
 .else @ numbytes == 4
        RGB444toRGB888_1pixel %(firstreg+0), MASK, SCRATCH
 .endif
.endm

generate_composite_function \
    Blit_RGB444_RGB888ARMSIMDAsm, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
    2, /* prefetch distance */ \
    RGB444toRGB888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    RGB444toRGB888_process_head, \
    RGB444toRGB888_process_tail