2
* Copyright © 2009 Nokia Corporation
4
* Permission is hereby granted, free of charge, to any person obtaining a
5
* copy of this software and associated documentation files (the "Software"),
6
* to deal in the Software without restriction, including without limitation
7
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
* and/or sell copies of the Software, and to permit persons to whom the
9
* Software is furnished to do so, subject to the following conditions:
11
* The above copyright notice and this permission notice (including the next
12
* paragraph) shall be included in all copies or substantial portions of the
15
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21
* DEALINGS IN THE SOFTWARE.
23
* Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
27
* This file contains implementations of NEON optimized pixel processing
28
* functions. There is no full and detailed tutorial, but some functions
29
* (those which are exposing some new or interesting features) are
30
* extensively commented and can be used as examples.
32
* You may want to have a look at the comments for following functions:
33
* - pixman_composite_over_8888_0565_asm_neon
34
* - pixman_composite_over_n_8_0565_asm_neon
37
/* Prevent the stack from becoming executable for no reason... */
38
#if defined(__linux__) && defined(__ELF__)
39
.section .note.GNU-stack,"",%progbits
46
.eabi_attribute 10, 0 /* suppress Tag_FP_arch */
47
.eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
52
#include "pixman-private.h"
53
#include "pixman-arm-neon-asm.h"
55
/* Global configuration options and preferences */
58
* The code can optionally make use of unaligned memory accesses to improve
59
* performance of handling leading/trailing pixels for each scanline.
60
* Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
61
* example in linux if unaligned memory accesses are not configured to
62
* generate exceptions.
64
.set RESPECT_STRICT_ALIGNMENT, 1
67
* Set default prefetch type. There is a choice between the following options:
69
* PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
70
* as NOP to workaround some HW bugs or for whatever other reason)
72
* PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
73
* advanced prefetch introduces heavy overhead)
75
* PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
76
* which can run ARM and NEON instructions simultaneously so that extra ARM
77
* instructions do not add (many) extra cycles, but improve prefetch efficiency)
79
* Note: some types of function can't support advanced prefetch and fall back
80
* to simple one (those which handle 24bpp pixels)
82
.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
84
/* Prefetch distance in pixels for simple prefetch */
85
.set PREFETCH_DISTANCE_SIMPLE, 64
88
* Implementation of pixman_composite_over_8888_0565_asm_neon
90
* This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
91
* performs OVER compositing operation. Function fast_composite_over_8888_0565
92
* from pixman-fast-path.c does the same in C and can be used as a reference.
94
* First we need to have some NEON assembly code which can do the actual
95
* operation on the pixels and provide it to the template macro.
97
* Template macro quite conveniently takes care of emitting all the necessary
98
* code for memory reading and writing (including quite tricky cases of
99
* handling unaligned leading/trailing pixels), so we only need to deal with
100
* the data in NEON registers.
102
* NEON register allocation in general is recommended to be the following:
103
* d0, d1, d2, d3 - contain loaded source pixel data
104
* d4, d5, d6, d7 - contain loaded destination pixels (if they are needed)
105
* d24, d25, d26, d27 - contain loaded mask pixel data (if mask is used)
106
* d28, d29, d30, d31 - place for storing the result (destination pixels)
108
* As can be seen above, four 64-bit NEON registers are used for keeping
109
* intermediate pixel data and up to 8 pixels can be processed in one step
110
* for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
112
* This particular function uses the following registers allocation:
113
* d0, d1, d2, d3 - contain loaded source pixel data
114
* d4, d5 - contain loaded destination pixels (they are needed)
115
* d28, d29 - place for storing the result (destination pixels)
119
* Step one. We need to have some code to do some arithmetics on pixel data.
120
* This is implemented as a pair of macros: '*_head' and '*_tail'. When used
121
* back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
122
* perform all the needed calculations and write the result to {d28, d29}.
123
* The rationale for having two macros and not just one will be explained
124
* later. In practice, any single monolithic function which does the work can
125
* be split into two parts in any arbitrary way without affecting correctness.
127
* There is one special trick here too. Common template macro can optionally
128
* make our life a bit easier by doing R, G, B, A color components
129
* deinterleaving for 32bpp pixel formats (and this feature is used in
130
* 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
131
* instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
132
* actually use d0 register for blue channel (a vector of eight 8-bit
133
* values), d1 register for green, d2 for red and d3 for alpha. This
134
* simple conversion can be also done with a few NEON instructions:
136
* Packed to planar conversion:
142
* Planar to packed conversion:
148
* But pixel can be loaded directly in planar format using VLD4.8 NEON
149
* instruction. It is 1 cycle slower than VLD1.32, so this is not always
150
* desirable, that's why deinterleaving is optional.
152
* But anyway, here is the code:
154
.macro pixman_composite_over_8888_0565_process_pixblock_head
155
/* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
156
and put data into d6 - red, d7 - green, d30 - blue */
161
vmvn.8 d3, d3 /* invert source alpha */
163
vshrn.u16 d30, q2, #2
164
/* now do alpha blending, storing results in 8-bit planar format
165
into d16 - red, d19 - green, d18 - blue */
168
vmull.u8 q12, d3, d30
169
vrshr.u16 q13, q10, #8
170
vrshr.u16 q3, q11, #8
171
vrshr.u16 q15, q12, #8
172
vraddhn.u16 d20, q10, q13
173
vraddhn.u16 d23, q11, q3
174
vraddhn.u16 d22, q12, q15
177
.macro pixman_composite_over_8888_0565_process_pixblock_tail
178
/* ... continue alpha blending */
179
vqadd.u8 d16, d2, d20
181
/* convert the result to r5g6b5 and store it into {d28, d29} */
182
vshll.u8 q14, d16, #8
186
vsri.u16 q14, q9, #11
190
* OK, now we got almost everything that we need. Using the above two
191
* macros, the work can be done right. But now we want to optimize
192
* it a bit. ARM Cortex-A8 is an in-order core, and benefits really
193
* a lot from good code scheduling and software pipelining.
195
* Let's construct some code, which will run in the core main loop.
196
* Some pseudo-code of the main loop will look like this:
204
* It may look a bit weird, but this setup allows hiding instruction
205
* latencies better and also utilize dual-issue capability more
206
* efficiently (make pairs of load-store and ALU instructions).
208
* So what we need now is a '*_tail_head' macro, which will be used
209
* in the core main loop. A trivial straightforward implementation
210
* of this macro would look like this:
212
* pixman_composite_over_8888_0565_process_pixblock_tail
213
* vst1.16 {d28, d29}, [DST_W, :128]!
214
* vld1.16 {d4, d5}, [DST_R, :128]!
215
* vld4.32 {d0, d1, d2, d3}, [SRC]!
216
* pixman_composite_over_8888_0565_process_pixblock_head
219
* Now it also got some VLD/VST instructions. We simply can't move from
220
* processing one block of pixels to the other one with just arithmetics.
221
* The previously processed data needs to be written to memory and new
222
* data needs to be fetched. Fortunately, this main loop does not deal
223
* with partial leading/trailing pixels and can load/store a full block
224
* of pixels in a bulk. Additionally, destination buffer is already
225
* 16 bytes aligned here (which is good for performance).
227
* New things here are DST_R, DST_W, SRC and MASK identifiers. These
228
* are the aliases for ARM registers which are used as pointers for
229
* accessing data. We maintain separate pointers for reading and writing
230
* destination buffer (DST_R and DST_W).
232
* Another new thing is 'cache_preload' macro. It is used for prefetching
233
* data into CPU L2 cache and improve performance when dealing with large
234
* images which are far larger than cache size. It uses one argument
235
* (actually two, but they need to be the same here) - number of pixels
236
* in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
237
* details about this macro. Moreover, if good performance is needed
238
* the code from this macro needs to be copied into '*_tail_head' macro
239
* and mixed with the rest of code for optimal instructions scheduling.
240
* We are actually doing it below.
242
* Now after all the explanations, here is the optimized code.
243
* Different instruction streams (originating from '*_head', '*_tail'
244
* and 'cache_preload' macro) use different indentation levels for
245
* better readability. Actually taking the code from one of these
246
* indentation levels and ignoring a few VLD/VST instructions would
247
* result in exactly the code from '*_head', '*_tail' or 'cache_preload'
253
.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
254
vqadd.u8 d16, d2, d20
255
vld1.16 {d4, d5}, [DST_R, :128]!
261
vshll.u8 q14, d16, #8
262
PF add PF_X, PF_X, #8
266
PF addne PF_X, PF_X, #8
268
PF subne PF_CTL, PF_CTL, #1
270
vshrn.u16 d30, q2, #2
272
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
274
vmull.u8 q12, d3, d30
275
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
279
vrshr.u16 q13, q10, #8
280
PF subge PF_X, PF_X, ORIG_W
281
vrshr.u16 q3, q11, #8
282
vrshr.u16 q15, q12, #8
283
PF subges PF_CTL, PF_CTL, #0x10
284
vsri.u16 q14, q9, #11
285
PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
286
vraddhn.u16 d20, q10, q13
287
vraddhn.u16 d23, q11, q3
288
PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
289
vraddhn.u16 d22, q12, q15
290
vst1.16 {d28, d29}, [DST_W, :128]!
295
/* If we did not care much about the performance, we would just use this... */
296
.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
297
pixman_composite_over_8888_0565_process_pixblock_tail
298
vst1.16 {d28, d29}, [DST_W, :128]!
299
vld1.16 {d4, d5}, [DST_R, :128]!
301
pixman_composite_over_8888_0565_process_pixblock_head
308
* And now the final part. We are using 'generate_composite_function' macro
309
* to put all the stuff together. We are specifying the name of the function
310
* which we want to get, number of bits per pixel for the source, mask and
311
* destination (0 if unused, like mask in this case). Next come some bit
313
* FLAG_DST_READWRITE - tells that the destination buffer is both read
314
* and written, for write-only buffer we would use
315
* FLAG_DST_WRITEONLY flag instead
316
* FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
317
* and separate color channels for 32bpp format.
318
* The next things are:
319
* - the number of pixels processed per iteration (8 in this case, because
320
* that's the maximum what can fit into four 64-bit NEON registers).
321
* - prefetch distance, measured in pixel blocks. In this case it is 5 times
322
* by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
323
* prefetch distance can be selected by running some benchmarks.
325
* After that we specify some macros, these are 'default_init',
326
* 'default_cleanup' here which are empty (but it is possible to have custom
327
* init/cleanup macros to be able to save/restore some extra NEON registers
328
* like d8-d15 or do anything else) followed by
329
* 'pixman_composite_over_8888_0565_process_pixblock_head',
330
* 'pixman_composite_over_8888_0565_process_pixblock_tail' and
331
* 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
332
* which we got implemented above.
334
* The last part is the NEON registers allocation scheme.
336
generate_composite_function \
337
pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
338
FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
339
8, /* number of pixels, processed in a single block */ \
340
5, /* prefetch distance */ \
343
pixman_composite_over_8888_0565_process_pixblock_head, \
344
pixman_composite_over_8888_0565_process_pixblock_tail, \
345
pixman_composite_over_8888_0565_process_pixblock_tail_head, \
346
28, /* dst_w_basereg */ \
347
4, /* dst_r_basereg */ \
348
0, /* src_basereg */ \
349
24 /* mask_basereg */
351
/******************************************************************************/
353
.macro pixman_composite_over_n_0565_process_pixblock_head
354
/* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
355
and put data into d6 - red, d7 - green, d30 - blue */
361
vshrn.u16 d30, q2, #2
362
/* now do alpha blending, storing results in 8-bit planar format
363
into d16 - red, d19 - green, d18 - blue */
366
vmull.u8 q12, d3, d30
367
vrshr.u16 q13, q10, #8
368
vrshr.u16 q3, q11, #8
369
vrshr.u16 q15, q12, #8
370
vraddhn.u16 d20, q10, q13
371
vraddhn.u16 d23, q11, q3
372
vraddhn.u16 d22, q12, q15
375
.macro pixman_composite_over_n_0565_process_pixblock_tail
376
/* ... continue alpha blending */
377
vqadd.u8 d16, d2, d20
379
/* convert the result to r5g6b5 and store it into {d28, d29} */
380
vshll.u8 q14, d16, #8
384
vsri.u16 q14, q9, #11
387
/* TODO: expand macros and do better instructions scheduling */
388
.macro pixman_composite_over_n_0565_process_pixblock_tail_head
389
pixman_composite_over_n_0565_process_pixblock_tail
390
vld1.16 {d4, d5}, [DST_R, :128]!
391
vst1.16 {d28, d29}, [DST_W, :128]!
392
pixman_composite_over_n_0565_process_pixblock_head
396
.macro pixman_composite_over_n_0565_init
397
add DUMMY, sp, #ARGS_STACK_OFFSET
398
vld1.32 {d3[0]}, [DUMMY]
403
vmvn.8 d3, d3 /* invert source alpha */
406
generate_composite_function \
407
pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
408
FLAG_DST_READWRITE, \
409
8, /* number of pixels, processed in a single block */ \
410
5, /* prefetch distance */ \
411
pixman_composite_over_n_0565_init, \
413
pixman_composite_over_n_0565_process_pixblock_head, \
414
pixman_composite_over_n_0565_process_pixblock_tail, \
415
pixman_composite_over_n_0565_process_pixblock_tail_head, \
416
28, /* dst_w_basereg */ \
417
4, /* dst_r_basereg */ \
418
0, /* src_basereg */ \
419
24 /* mask_basereg */
421
/******************************************************************************/
423
.macro pixman_composite_src_8888_0565_process_pixblock_head
429
.macro pixman_composite_src_8888_0565_process_pixblock_tail
431
vsri.u16 q14, q9, #11
434
.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
436
PF add PF_X, PF_X, #8
439
PF addne PF_X, PF_X, #8
440
PF subne PF_CTL, PF_CTL, #1
441
vsri.u16 q14, q9, #11
443
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
445
vst1.16 {d28, d29}, [DST_W, :128]!
446
PF subge PF_X, PF_X, ORIG_W
447
PF subges PF_CTL, PF_CTL, #0x10
449
PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
453
generate_composite_function \
454
pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
455
FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
456
8, /* number of pixels, processed in a single block */ \
457
10, /* prefetch distance */ \
460
pixman_composite_src_8888_0565_process_pixblock_head, \
461
pixman_composite_src_8888_0565_process_pixblock_tail, \
462
pixman_composite_src_8888_0565_process_pixblock_tail_head
464
/******************************************************************************/
466
.macro pixman_composite_src_0565_8888_process_pixblock_head
467
vshrn.u16 d30, q0, #8
468
vshrn.u16 d29, q0, #3
473
vshrn.u16 d28, q0, #2
476
.macro pixman_composite_src_0565_8888_process_pixblock_tail
479
/* TODO: expand macros and do better instructions scheduling */
480
.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
481
pixman_composite_src_0565_8888_process_pixblock_tail
482
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
484
pixman_composite_src_0565_8888_process_pixblock_head
488
generate_composite_function \
489
pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
490
FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
491
8, /* number of pixels, processed in a single block */ \
492
10, /* prefetch distance */ \
495
pixman_composite_src_0565_8888_process_pixblock_head, \
496
pixman_composite_src_0565_8888_process_pixblock_tail, \
497
pixman_composite_src_0565_8888_process_pixblock_tail_head
499
/******************************************************************************/
501
.macro pixman_composite_add_8_8_process_pixblock_head
506
.macro pixman_composite_add_8_8_process_pixblock_tail
509
.macro pixman_composite_add_8_8_process_pixblock_tail_head
511
PF add PF_X, PF_X, #32
513
vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
514
PF addne PF_X, PF_X, #32
515
PF subne PF_CTL, PF_CTL, #1
516
vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
518
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
519
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
520
PF subge PF_X, PF_X, ORIG_W
521
PF subges PF_CTL, PF_CTL, #0x10
523
PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
524
PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
528
generate_composite_function \
529
pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
530
FLAG_DST_READWRITE, \
531
32, /* number of pixels, processed in a single block */ \
532
10, /* prefetch distance */ \
535
pixman_composite_add_8_8_process_pixblock_head, \
536
pixman_composite_add_8_8_process_pixblock_tail, \
537
pixman_composite_add_8_8_process_pixblock_tail_head
539
/******************************************************************************/
541
.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
543
PF add PF_X, PF_X, #8
545
vld1.32 {d4, d5, d6, d7}, [DST_R, :128]!
546
PF addne PF_X, PF_X, #8
547
PF subne PF_CTL, PF_CTL, #1
548
vst1.32 {d28, d29, d30, d31}, [DST_W, :128]!
550
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
551
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
552
PF subge PF_X, PF_X, ORIG_W
553
PF subges PF_CTL, PF_CTL, #0x10
555
PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
556
PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
560
generate_composite_function \
561
pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
562
FLAG_DST_READWRITE, \
563
8, /* number of pixels, processed in a single block */ \
564
10, /* prefetch distance */ \
567
pixman_composite_add_8_8_process_pixblock_head, \
568
pixman_composite_add_8_8_process_pixblock_tail, \
569
pixman_composite_add_8888_8888_process_pixblock_tail_head
571
generate_composite_function_single_scanline \
572
pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
573
FLAG_DST_READWRITE, \
574
8, /* number of pixels, processed in a single block */ \
577
pixman_composite_add_8_8_process_pixblock_head, \
578
pixman_composite_add_8_8_process_pixblock_tail, \
579
pixman_composite_add_8888_8888_process_pixblock_tail_head
581
/******************************************************************************/
583
.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
584
vmvn.8 d24, d3 /* get inverted alpha */
585
/* do alpha blending */
588
vmull.u8 q10, d24, d6
589
vmull.u8 q11, d24, d7
592
.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
/*
 * Finish the per-channel multiplies started in the matching '*_head'
 * macro: q10/q11 were set there as (inverted src alpha) * dst channel
 * (q8/q9 are presumably set the same way on lines missing from this
 * extracted chunk — confirm against the original file). Each 16-bit
 * product x is divided by 255 with the standard rounding approximation
 *     ((x + ((x + 128) >> 8) + 128) >> 8)
 * implemented as VRSHR (rounded x >> 8) followed by VRADDHN (rounded
 * add + take high half), leaving the four 8-bit result channels in
 * {d28, d29, d30, d31}.
 * NOTE(review): the standalone numeric lines below and the missing
 * closing '.endm' are extraction artifacts in this chunk — restore
 * from the original source before assembling.
 */
593
vrshr.u16 q14, q8, #8
594
vrshr.u16 q15, q9, #8
595
vrshr.u16 q12, q10, #8
596
vrshr.u16 q13, q11, #8
597
vraddhn.u16 d28, q14, q8
598
vraddhn.u16 d29, q15, q9
599
vraddhn.u16 d30, q12, q10
600
vraddhn.u16 d31, q13, q11
603
.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
604
vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
605
vrshr.u16 q14, q8, #8
606
PF add PF_X, PF_X, #8
608
vrshr.u16 q15, q9, #8
609
vrshr.u16 q12, q10, #8
610
vrshr.u16 q13, q11, #8
611
PF addne PF_X, PF_X, #8
612
PF subne PF_CTL, PF_CTL, #1
613
vraddhn.u16 d28, q14, q8
614
vraddhn.u16 d29, q15, q9
616
vraddhn.u16 d30, q12, q10
617
vraddhn.u16 d31, q13, q11
619
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
621
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
622
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
623
PF subge PF_X, PF_X, ORIG_W
625
PF subges PF_CTL, PF_CTL, #0x10
627
PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
628
vmull.u8 q10, d22, d6
629
PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
630
vmull.u8 q11, d22, d7
633
generate_composite_function_single_scanline \
634
pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
635
FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
636
8, /* number of pixels, processed in a single block */ \
639
pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
640
pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
641
pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
643
/******************************************************************************/
645
.macro pixman_composite_over_8888_8888_process_pixblock_head
/*
 * OVER head simply reuses the OUT_REVERSE head, which (per its visible
 * body above) inverts source alpha and multiplies it with destination
 * channels into the q8-q11 accumulators.
 * NOTE(review): the numeric line below and the missing closing '.endm'
 * are extraction artifacts in this chunk.
 */
646
pixman_composite_out_reverse_8888_8888_process_pixblock_head
649
.macro pixman_composite_over_8888_8888_process_pixblock_tail
650
/* finish OUT_REVERSE: {d28-d31} = (1 - src_alpha) * dst per channel */
pixman_composite_out_reverse_8888_8888_process_pixblock_tail
651
/* OVER = src + (1 - src_alpha) * dst, using saturating byte adds;
 * q0/q1 hold the (deinterleaved) source channels.
 * NOTE(review): numeric lines and the missing '.endm' are extraction
 * artifacts in this chunk. */
vqadd.u8 q14, q0, q14
652
vqadd.u8 q15, q1, q15
655
.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
656
vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
657
vrshr.u16 q14, q8, #8
658
PF add PF_X, PF_X, #8
660
vrshr.u16 q15, q9, #8
661
vrshr.u16 q12, q10, #8
662
vrshr.u16 q13, q11, #8
663
PF addne PF_X, PF_X, #8
664
PF subne PF_CTL, PF_CTL, #1
665
vraddhn.u16 d28, q14, q8
666
vraddhn.u16 d29, q15, q9
668
vraddhn.u16 d30, q12, q10
669
vraddhn.u16 d31, q13, q11
670
vqadd.u8 q14, q0, q14
671
vqadd.u8 q15, q1, q15
673
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
675
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
676
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
677
PF subge PF_X, PF_X, ORIG_W
679
PF subges PF_CTL, PF_CTL, #0x10
681
PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
682
vmull.u8 q10, d22, d6
683
PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
684
vmull.u8 q11, d22, d7
687
generate_composite_function \
688
pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
689
FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
690
8, /* number of pixels, processed in a single block */ \
691
5, /* prefetch distance */ \
694
pixman_composite_over_8888_8888_process_pixblock_head, \
695
pixman_composite_over_8888_8888_process_pixblock_tail, \
696
pixman_composite_over_8888_8888_process_pixblock_tail_head
698
generate_composite_function_single_scanline \
699
pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
700
FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
701
8, /* number of pixels, processed in a single block */ \
704
pixman_composite_over_8888_8888_process_pixblock_head, \
705
pixman_composite_over_8888_8888_process_pixblock_tail, \
706
pixman_composite_over_8888_8888_process_pixblock_tail_head
708
/******************************************************************************/
710
.macro pixman_composite_over_n_8888_process_pixblock_head
711
/* deinterleaved source pixels in {d0, d1, d2, d3} */
712
/* inverted alpha in {d24} */
713
/* destination pixels in {d4, d5, d6, d7} */
716
vmull.u8 q10, d24, d6
717
vmull.u8 q11, d24, d7
720
.macro pixman_composite_over_n_8888_process_pixblock_tail
/*
 * Tail for solid-source OVER on a8r8g8b8: q8-q11 hold the 16-bit
 * (inverted alpha * dst channel) products from '*_head'. Divide each
 * by 255 with the rounding approximation
 *     ((x + ((x + 128) >> 8) + 128) >> 8)
 * (VRSHR then VRADDHN), narrowing into {d28-d31}, then add the solid
 * source channels from q0/q1 with saturating byte adds to complete
 * OVER = src + (1 - src_alpha) * dst.
 * Note this variant uses q2/q3 as scratch (unlike the generic tail,
 * which uses q12/q13).
 * NOTE(review): the standalone numeric lines and the missing closing
 * '.endm' are extraction artifacts in this chunk.
 */
721
vrshr.u16 q14, q8, #8
722
vrshr.u16 q15, q9, #8
723
vrshr.u16 q2, q10, #8
724
vrshr.u16 q3, q11, #8
725
vraddhn.u16 d28, q14, q8
726
vraddhn.u16 d29, q15, q9
727
vraddhn.u16 d30, q2, q10
728
vraddhn.u16 d31, q3, q11
729
vqadd.u8 q14, q0, q14
730
vqadd.u8 q15, q1, q15
733
.macro pixman_composite_over_n_8888_process_pixblock_tail_head
734
vrshr.u16 q14, q8, #8
735
vrshr.u16 q15, q9, #8
736
vrshr.u16 q2, q10, #8
737
vrshr.u16 q3, q11, #8
738
vraddhn.u16 d28, q14, q8
739
vraddhn.u16 d29, q15, q9
740
vraddhn.u16 d30, q2, q10
741
vraddhn.u16 d31, q3, q11
742
vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
743
vqadd.u8 q14, q0, q14
744
PF add PF_X, PF_X, #8
746
PF addne PF_X, PF_X, #8
747
PF subne PF_CTL, PF_CTL, #1
748
vqadd.u8 q15, q1, q15
751
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
753
PF subge PF_X, PF_X, ORIG_W
754
vmull.u8 q10, d24, d6
755
PF subges PF_CTL, PF_CTL, #0x10
756
vmull.u8 q11, d24, d7
757
PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
758
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
761
.macro pixman_composite_over_n_8888_init
762
add DUMMY, sp, #ARGS_STACK_OFFSET
763
vld1.32 {d3[0]}, [DUMMY]
768
vmvn.8 d24, d3 /* get inverted alpha */
771
generate_composite_function \
772
pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
773
FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
774
8, /* number of pixels, processed in a single block */ \
775
5, /* prefetch distance */ \
776
pixman_composite_over_n_8888_init, \
778
pixman_composite_over_8888_8888_process_pixblock_head, \
779
pixman_composite_over_8888_8888_process_pixblock_tail, \
780
pixman_composite_over_n_8888_process_pixblock_tail_head
782
/******************************************************************************/
784
.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
785
vrshr.u16 q14, q8, #8
786
PF add PF_X, PF_X, #8
788
vrshr.u16 q15, q9, #8
789
vrshr.u16 q12, q10, #8
790
vrshr.u16 q13, q11, #8
791
PF addne PF_X, PF_X, #8
792
PF subne PF_CTL, PF_CTL, #1
793
vraddhn.u16 d28, q14, q8
794
vraddhn.u16 d29, q15, q9
796
vraddhn.u16 d30, q12, q10
797
vraddhn.u16 d31, q13, q11
798
vqadd.u8 q14, q0, q14
799
vqadd.u8 q15, q1, q15
800
vld4.8 {d0, d1, d2, d3}, [DST_R, :128]!
802
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
803
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
804
PF subge PF_X, PF_X, ORIG_W
806
PF subges PF_CTL, PF_CTL, #0x10
808
vmull.u8 q10, d22, d6
809
PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
810
vmull.u8 q11, d22, d7
813
.macro pixman_composite_over_reverse_n_8888_init
814
add DUMMY, sp, #ARGS_STACK_OFFSET
815
vld1.32 {d7[0]}, [DUMMY]
822
generate_composite_function \
823
pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
824
FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
825
8, /* number of pixels, processed in a single block */ \
826
5, /* prefetch distance */ \
827
pixman_composite_over_reverse_n_8888_init, \
829
pixman_composite_over_8888_8888_process_pixblock_head, \
830
pixman_composite_over_8888_8888_process_pixblock_tail, \
831
pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
832
28, /* dst_w_basereg */ \
833
0, /* dst_r_basereg */ \
834
4, /* src_basereg */ \
835
24 /* mask_basereg */
837
/******************************************************************************/
839
.macro pixman_composite_over_8888_8_0565_process_pixblock_head
840
vmull.u8 q0, d24, d8 /* IN for SRC pixels (part1) */
842
vmull.u8 q6, d24, d10
843
vmull.u8 q7, d24, d11
844
vshrn.u16 d6, q2, #8 /* convert DST_R data to 32-bpp (part1) */
847
vrshr.u16 q8, q0, #8 /* IN for SRC pixels (part2) */
849
vrshr.u16 q10, q6, #8
850
vrshr.u16 q11, q7, #8
851
vraddhn.u16 d0, q0, q8
852
vraddhn.u16 d1, q1, q9
853
vraddhn.u16 d2, q6, q10
854
vraddhn.u16 d3, q7, q11
855
vsri.u8 d6, d6, #5 /* convert DST_R data to 32-bpp (part2) */
858
vshrn.u16 d30, q2, #2
859
vmull.u8 q8, d3, d6 /* now do alpha blending */
861
vmull.u8 q10, d3, d30
864
.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
865
/* 3 cycle bubble (after vmull.u8) */
866
vrshr.u16 q13, q8, #8
867
vrshr.u16 q11, q9, #8
868
vrshr.u16 q15, q10, #8
869
vraddhn.u16 d16, q8, q13
870
vraddhn.u16 d27, q9, q11
871
vraddhn.u16 d26, q10, q15
872
vqadd.u8 d16, d2, d16
875
vshll.u8 q14, d16, #8 /* convert to 16bpp */
880
vsri.u16 q14, q9, #11
883
.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
884
vld1.16 {d4, d5}, [DST_R, :128]!
889
vmull.u8 q6, d24, d10
890
vrshr.u16 q13, q8, #8
891
vrshr.u16 q11, q9, #8
892
vrshr.u16 q15, q10, #8
893
vraddhn.u16 d16, q8, q13
894
vraddhn.u16 d27, q9, q11
895
vraddhn.u16 d26, q10, q15
896
vqadd.u8 d16, d2, d16
899
vshll.u8 q14, d16, #8
904
vmull.u8 q7, d24, d11
905
vsri.u16 q14, q9, #11
912
vrshr.u16 q10, q6, #8
913
vrshr.u16 q11, q7, #8
914
vraddhn.u16 d0, q0, q8
915
vraddhn.u16 d1, q1, q9
916
vraddhn.u16 d2, q6, q10
917
vraddhn.u16 d3, q7, q11
921
vshrn.u16 d30, q2, #2
922
vst1.16 {d28, d29}, [DST_W, :128]!
925
vmull.u8 q10, d3, d30
928
generate_composite_function \
929
pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
930
FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
931
8, /* number of pixels, processed in a single block */ \
932
5, /* prefetch distance */ \
933
default_init_need_all_regs, \
934
default_cleanup_need_all_regs, \
935
pixman_composite_over_8888_8_0565_process_pixblock_head, \
936
pixman_composite_over_8888_8_0565_process_pixblock_tail, \
937
pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
938
28, /* dst_w_basereg */ \
939
4, /* dst_r_basereg */ \
940
8, /* src_basereg */ \
941
24 /* mask_basereg */
943
/******************************************************************************/
946
* This function needs a special initialization of solid mask.
947
* Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
948
* offset, split into color components and replicated in d8-d11
949
* registers. Additionally, this function needs all the NEON registers,
950
* so it has to save d8-d15 registers which are callee saved according
951
* to ABI. These registers are restored from 'cleanup' macro. All the
952
* other NEON registers are caller saved, so can be clobbered freely
953
* without introducing any problems.
955
.macro pixman_composite_over_n_8_0565_init
956
add DUMMY, sp, #ARGS_STACK_OFFSET
958
vld1.32 {d11[0]}, [DUMMY]
965
.macro pixman_composite_over_n_8_0565_cleanup
969
generate_composite_function \
970
pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
971
FLAG_DST_READWRITE, \
972
8, /* number of pixels, processed in a single block */ \
973
5, /* prefetch distance */ \
974
pixman_composite_over_n_8_0565_init, \
975
pixman_composite_over_n_8_0565_cleanup, \
976
pixman_composite_over_8888_8_0565_process_pixblock_head, \
977
pixman_composite_over_8888_8_0565_process_pixblock_tail, \
978
pixman_composite_over_8888_8_0565_process_pixblock_tail_head
980
/******************************************************************************/
982
.macro pixman_composite_over_8888_n_0565_init
983
add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
985
vld1.32 {d24[0]}, [DUMMY]
989
.macro pixman_composite_over_8888_n_0565_cleanup
993
generate_composite_function \
994
pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
995
FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
996
8, /* number of pixels, processed in a single block */ \
997
5, /* prefetch distance */ \
998
pixman_composite_over_8888_n_0565_init, \
999
pixman_composite_over_8888_n_0565_cleanup, \
1000
pixman_composite_over_8888_8_0565_process_pixblock_head, \
1001
pixman_composite_over_8888_8_0565_process_pixblock_tail, \
1002
pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
1003
28, /* dst_w_basereg */ \
1004
4, /* dst_r_basereg */ \
1005
8, /* src_basereg */ \
1006
24 /* mask_basereg */
1008
/******************************************************************************/
/* src_0565_0565 is a plain 16bpp copy: no per-pixel math is needed */
.macro pixman_composite_src_0565_0565_process_pixblock_head
.endm

.macro pixman_composite_src_0565_0565_process_pixblock_tail
.endm

.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
    vst1.16     {d0, d1, d2, d3}, [DST_W, :128]!
    fetch_src_pixblock
    cache_preload 16, 16
.endm

generate_composite_function \
    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
    FLAG_DST_WRITEONLY, \
    16, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0565_0565_process_pixblock_head, \
    pixman_composite_src_0565_0565_process_pixblock_tail, \
    pixman_composite_src_0565_0565_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */
/******************************************************************************/
/* solid fill of an a8 destination: the color is pre-expanded in init */
.macro pixman_composite_src_n_8_process_pixblock_head
.endm

.macro pixman_composite_src_n_8_process_pixblock_tail
.endm

.macro pixman_composite_src_n_8_process_pixblock_tail_head
    vst1.8      {d0, d1, d2, d3}, [DST_W, :128]!
.endm

.macro pixman_composite_src_n_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    /* replicate the 8-bit value across d0, then fill d1-d3 */
    vsli.u64    d0, d0, #8
    vsli.u64    d0, d0, #16
    vsli.u64    d0, d0, #32
    vorr        d1, d0, d0
    vorr        q1, q0, q0
.endm

.macro pixman_composite_src_n_8_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
    FLAG_DST_WRITEONLY, \
    32, /* number of pixels, processed in a single block */ \
    0,  /* prefetch distance */ \
    pixman_composite_src_n_8_init, \
    pixman_composite_src_n_8_cleanup, \
    pixman_composite_src_n_8_process_pixblock_head, \
    pixman_composite_src_n_8_process_pixblock_tail, \
    pixman_composite_src_n_8_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */
/******************************************************************************/
/* solid fill of an r5g6b5 destination */
.macro pixman_composite_src_n_0565_process_pixblock_head
.endm

.macro pixman_composite_src_n_0565_process_pixblock_tail
.endm

.macro pixman_composite_src_n_0565_process_pixblock_tail_head
    vst1.16     {d0, d1, d2, d3}, [DST_W, :128]!
.endm

.macro pixman_composite_src_n_0565_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    /* replicate the 16-bit color across d0, then fill d1-d3 */
    vsli.u64    d0, d0, #16
    vsli.u64    d0, d0, #32
    vorr        d1, d0, d0
    vorr        q1, q0, q0
.endm

.macro pixman_composite_src_n_0565_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
    FLAG_DST_WRITEONLY, \
    16, /* number of pixels, processed in a single block */ \
    0,  /* prefetch distance */ \
    pixman_composite_src_n_0565_init, \
    pixman_composite_src_n_0565_cleanup, \
    pixman_composite_src_n_0565_process_pixblock_head, \
    pixman_composite_src_n_0565_process_pixblock_tail, \
    pixman_composite_src_n_0565_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */
/******************************************************************************/
/* solid fill of an a8r8g8b8/x8r8g8b8 destination */
.macro pixman_composite_src_n_8888_process_pixblock_head
.endm

.macro pixman_composite_src_n_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_n_8888_process_pixblock_tail_head
    vst1.32     {d0, d1, d2, d3}, [DST_W, :128]!
.endm

.macro pixman_composite_src_n_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    /* replicate the 32-bit color across d0, then fill d1-d3 */
    vsli.u64    d0, d0, #32
    vorr        d1, d0, d0
    vorr        q1, q0, q0
.endm

.macro pixman_composite_src_n_8888_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    0, /* prefetch distance */ \
    pixman_composite_src_n_8888_init, \
    pixman_composite_src_n_8888_cleanup, \
    pixman_composite_src_n_8888_process_pixblock_head, \
    pixman_composite_src_n_8888_process_pixblock_tail, \
    pixman_composite_src_n_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */
/******************************************************************************/
/* 32bpp copy: store the previous block while fetching the next one */
.macro pixman_composite_src_8888_8888_process_pixblock_head
.endm

.macro pixman_composite_src_8888_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
    vst1.32     {d0, d1, d2, d3}, [DST_W, :128]!
    fetch_src_pixblock
    cache_preload 8, 8
.endm

generate_composite_function \
    pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8,  /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_8888_8888_process_pixblock_head, \
    pixman_composite_src_8888_8888_process_pixblock_tail, \
    pixman_composite_src_8888_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */
/******************************************************************************/
/* x888 -> 8888 copy: force the alpha byte to 0xFF (mask prepared in init) */
.macro pixman_composite_src_x888_8888_process_pixblock_head
    vorr        q0, q0, q2
    vorr        q1, q1, q2
.endm

.macro pixman_composite_src_x888_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_x888_8888_process_pixblock_tail_head
    vst1.32     {d0, d1, d2, d3}, [DST_W, :128]!
    fetch_src_pixblock
    vorr        q0, q0, q2
    vorr        q1, q1, q2
    cache_preload 8, 8
.endm

.macro pixman_composite_src_x888_8888_init
    /* q2 = 0xFF000000 per pixel, OR-ed in to set alpha */
    vmov.u8     q2, #0xFF
    vshl.u32    q2, q2, #24
.endm

generate_composite_function \
    pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8,  /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    pixman_composite_src_x888_8888_init, \
    default_cleanup, \
    pixman_composite_src_x888_8888_process_pixblock_head, \
    pixman_composite_src_x888_8888_process_pixblock_tail, \
    pixman_composite_src_x888_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */
/******************************************************************************/
.macro pixman_composite_src_n_8_8888_process_pixblock_head
    /* expecting solid source in {d0, d1, d2, d3} */
    /* mask is in d24 (d25, d26, d27 are unused) */

    /* in */
    vmull.u8    q8, d24, d0
    vmull.u8    q9, d24, d1
    vmull.u8    q10, d24, d2
    vmull.u8    q11, d24, d3
    vrsra.u16   q8, q8, #8
    vrsra.u16   q9, q9, #8
    vrsra.u16   q10, q10, #8
    vrsra.u16   q11, q11, #8
.endm

.macro pixman_composite_src_n_8_8888_process_pixblock_tail
    vrshrn.u16  d28, q8, #8
    vrshrn.u16  d29, q9, #8
    vrshrn.u16  d30, q10, #8
    vrshrn.u16  d31, q11, #8
.endm

.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
    fetch_mask_pixblock
    PF add PF_X, PF_X, #8
    vrshrn.u16  d28, q8, #8
    PF tst PF_CTL, #0x0F
    vrshrn.u16  d29, q9, #8
    PF addne PF_X, PF_X, #8
    vrshrn.u16  d30, q10, #8
    PF subne PF_CTL, PF_CTL, #1
    vrshrn.u16  d31, q11, #8
    PF cmp PF_X, ORIG_W
    vmull.u8    q8, d24, d0
    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
    vmull.u8    q9, d24, d1
    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q10, d24, d2
    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q11, d24, d3
    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vrsra.u16   q8, q8, #8
    vrsra.u16   q9, q9, #8
    vrsra.u16   q10, q10, #8
    vrsra.u16   q11, q11, #8
.endm

.macro pixman_composite_src_n_8_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
    /* replicate the solid source color per channel into d0-d3 */
    vdup.8      d0, d3[0]
    vdup.8      d1, d3[1]
    vdup.8      d2, d3[2]
    vdup.8      d3, d3[3]
.endm

.macro pixman_composite_src_n_8_8888_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_src_n_8_8888_init, \
    pixman_composite_src_n_8_8888_cleanup, \
    pixman_composite_src_n_8_8888_process_pixblock_head, \
    pixman_composite_src_n_8_8888_process_pixblock_tail, \
    pixman_composite_src_n_8_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    24  /* mask_basereg  */
/******************************************************************************/
.macro pixman_composite_src_n_8_8_process_pixblock_head
    /* solid source alpha in d16, mask in {d24, d25, d26, d27} */
    vmull.u8    q0, d24, d16
    vmull.u8    q1, d25, d16
    vmull.u8    q2, d26, d16
    vmull.u8    q3, d27, d16
    vrsra.u16   q0, q0, #8
    vrsra.u16   q1, q1, #8
    vrsra.u16   q2, q2, #8
    vrsra.u16   q3, q3, #8
.endm

.macro pixman_composite_src_n_8_8_process_pixblock_tail
    vrshrn.u16  d28, q0, #8
    vrshrn.u16  d29, q1, #8
    vrshrn.u16  d30, q2, #8
    vrshrn.u16  d31, q3, #8
.endm

.macro pixman_composite_src_n_8_8_process_pixblock_tail_head
    fetch_mask_pixblock
    PF add PF_X, PF_X, #8
    vrshrn.u16  d28, q0, #8
    PF tst PF_CTL, #0x0F
    vrshrn.u16  d29, q1, #8
    PF addne PF_X, PF_X, #8
    vrshrn.u16  d30, q2, #8
    PF subne PF_CTL, PF_CTL, #1
    vrshrn.u16  d31, q3, #8
    PF cmp PF_X, ORIG_W
    vmull.u8    q0, d24, d16
    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
    vmull.u8    q1, d25, d16
    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q2, d26, d16
    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q3, d27, d16
    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vrsra.u16   q0, q0, #8
    vrsra.u16   q1, q1, #8
    vrsra.u16   q2, q2, #8
    vrsra.u16   q3, q3, #8
.endm

.macro pixman_composite_src_n_8_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d16[0]}, [DUMMY]
    /* broadcast source alpha into all lanes of d16 */
    vdup.8      d16, d16[3]
.endm

.macro pixman_composite_src_n_8_8_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \
    FLAG_DST_WRITEONLY, \
    32, /* number of pixels, processed in a single block */ \
    5,  /* prefetch distance */ \
    pixman_composite_src_n_8_8_init, \
    pixman_composite_src_n_8_8_cleanup, \
    pixman_composite_src_n_8_8_process_pixblock_head, \
    pixman_composite_src_n_8_8_process_pixblock_tail, \
    pixman_composite_src_n_8_8_process_pixblock_tail_head
/******************************************************************************/
.macro pixman_composite_over_n_8_8888_process_pixblock_head
    /* expecting deinterleaved source data in {d8, d9, d10, d11} */
    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
    /* and destination data in {d4, d5, d6, d7} */
    /* mask is in d24 (d25, d26, d27 are unused) */

    /* in */
    vmull.u8    q6, d24, d8
    vmull.u8    q7, d24, d9
    vmull.u8    q8, d24, d10
    vmull.u8    q9, d24, d11
    vrshr.u16   q10, q6, #8
    vrshr.u16   q11, q7, #8
    vrshr.u16   q12, q8, #8
    vrshr.u16   q13, q9, #8
    vraddhn.u16 d0, q6, q10
    vraddhn.u16 d1, q7, q11
    vraddhn.u16 d2, q8, q12
    vraddhn.u16 d3, q9, q13
    vmvn.8      d25, d3  /* get inverted alpha */
    /* source:      d0 - blue, d1 - green, d2 - red, d3 - alpha */
    /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
    /* now do alpha blending */
    vmull.u8    q8, d25, d4
    vmull.u8    q9, d25, d5
    vmull.u8    q10, d25, d6
    vmull.u8    q11, d25, d7
.endm

.macro pixman_composite_over_n_8_8888_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q6, q10, #8
    vrshr.u16   q7, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q6, q10
    vraddhn.u16 d31, q7, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
    vrshr.u16   q14, q8, #8
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vrshr.u16   q15, q9, #8
    fetch_mask_pixblock
    vrshr.u16   q6, q10, #8
    PF add PF_X, PF_X, #8
    vrshr.u16   q7, q11, #8
    PF tst PF_CTL, #0x0F
    vraddhn.u16 d28, q14, q8
    PF addne PF_X, PF_X, #8
    vraddhn.u16 d29, q15, q9
    PF subne PF_CTL, PF_CTL, #1
    vraddhn.u16 d30, q6, q10
    PF cmp PF_X, ORIG_W
    vraddhn.u16 d31, q7, q11
    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vmull.u8    q6, d24, d8
    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
    vmull.u8    q7, d24, d9
    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q8, d24, d10
    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q9, d24, d11
    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vqadd.u8    q14, q0, q14
    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
    vqadd.u8    q15, q1, q15
    vrshr.u16   q10, q6, #8
    vrshr.u16   q11, q7, #8
    vrshr.u16   q12, q8, #8
    vrshr.u16   q13, q9, #8
    vraddhn.u16 d0, q6, q10
    vraddhn.u16 d1, q7, q11
    vraddhn.u16 d2, q8, q12
    vraddhn.u16 d3, q9, q13
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vmvn.8      d25, d3
    vmull.u8    q8, d25, d4
    vmull.u8    q9, d25, d5
    vmull.u8    q10, d25, d6
    vmull.u8    q11, d25, d7
.endm

.macro pixman_composite_over_n_8_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    /* d8-d15 are callee saved (AAPCS) and used by the pixblock code */
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    /* replicate the solid source color per channel: d8=B, d9=G, d10=R, d11=A */
    vdup.8      d8, d11[0]
    vdup.8      d9, d11[1]
    vdup.8      d10, d11[2]
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_over_n_8_8888_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8_8888_init, \
    pixman_composite_over_n_8_8888_cleanup, \
    pixman_composite_over_n_8_8888_process_pixblock_head, \
    pixman_composite_over_n_8_8888_process_pixblock_tail, \
    pixman_composite_over_n_8_8888_process_pixblock_tail_head
/******************************************************************************/
.macro pixman_composite_over_n_8_8_process_pixblock_head
    /* solid source alpha in d8, destination in {d4, d5, d6, d7}, */
    /* mask in {d24, d25, d26, d27} */
    vmull.u8    q0, d24, d8
    vmull.u8    q1, d25, d8
    vmull.u8    q6, d26, d8
    vmull.u8    q7, d27, d8
    vrshr.u16   q10, q0, #8
    vrshr.u16   q11, q1, #8
    vrshr.u16   q12, q6, #8
    vrshr.u16   q13, q7, #8
    vraddhn.u16 d0, q0, q10
    vraddhn.u16 d1, q1, q11
    vraddhn.u16 d2, q6, q12
    vraddhn.u16 d3, q7, q13
    /* invert the combined (src IN mask) alpha for blending with dest */
    vmvn.8      q12, q0
    vmvn.8      q13, q1
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d25, d5
    vmull.u8    q10, d26, d6
    vmull.u8    q11, d27, d7
.endm

.macro pixman_composite_over_n_8_8_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_n_8_8_process_pixblock_tail_head
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
    pixman_composite_over_n_8_8_process_pixblock_tail
    fetch_mask_pixblock
    cache_preload 32, 32
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
    pixman_composite_over_n_8_8_process_pixblock_head
.endm

.macro pixman_composite_over_n_8_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    /* d8-d15 are callee saved (AAPCS) */
    vpush       {d8-d15}
    vld1.32     {d8[0]}, [DUMMY]
    /* broadcast source alpha into all lanes of d8 */
    vdup.8      d8, d8[3]
.endm

.macro pixman_composite_over_n_8_8_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    5,  /* prefetch distance */ \
    pixman_composite_over_n_8_8_init, \
    pixman_composite_over_n_8_8_cleanup, \
    pixman_composite_over_n_8_8_process_pixblock_head, \
    pixman_composite_over_n_8_8_process_pixblock_tail, \
    pixman_composite_over_n_8_8_process_pixblock_tail_head
/******************************************************************************/
.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
    /*
     * 'combine_mask_ca' replacement
     *
     * input:  solid src (n) in {d8,  d9,  d10, d11}
     *         dest in          {d4,  d5,  d6,  d7 }
     *         mask in          {d24, d25, d26, d27}
     * output: updated src in   {d0,  d1,  d2,  d3 }
     *         updated mask in  {d24, d25, d26, d3 }
     */
    vmull.u8    q0, d24, d8
    vmull.u8    q1, d25, d9
    vmull.u8    q6, d26, d10
    vmull.u8    q7, d27, d11
    vmull.u8    q9, d11, d25
    vmull.u8    q12, d11, d24
    vmull.u8    q13, d11, d26
    vrshr.u16   q8, q0, #8
    vrshr.u16   q10, q1, #8
    vrshr.u16   q11, q6, #8
    vraddhn.u16 d0, q0, q8
    vraddhn.u16 d1, q1, q10
    vraddhn.u16 d2, q6, q11
    vrshr.u16   q11, q12, #8
    vrshr.u16   q8, q9, #8
    vrshr.u16   q6, q13, #8
    vrshr.u16   q10, q7, #8
    vraddhn.u16 d24, q12, q11
    vraddhn.u16 d25, q9, q8
    vraddhn.u16 d26, q13, q6
    vraddhn.u16 d3, q7, q10
    /*
     * 'combine_over_ca' replacement
     *
     * output: updated dest in {d28, d29, d30, d31}
     */
    vmvn.8      q12, q12          /* invert (srca IN mask) for B and G */
    vmvn.8      d26, d26          /* ... and for R */
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d25, d5
    vmvn.8      d27, d3           /* inverted alpha channel factor */
    vmull.u8    q10, d26, d6
    vmull.u8    q11, d27, d7
.endm

.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
    /* ... continue 'combine_over_ca' replacement */
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q6, q10, #8
    vrshr.u16   q7, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q6, q10
    vraddhn.u16 d31, q7, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vrshr.u16   q6, q10, #8
    vrshr.u16   q7, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q6, q10
    vraddhn.u16 d31, q7, q11
    fetch_mask_pixblock
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
    cache_preload 8, 8
    pixman_composite_over_n_8888_8888_ca_process_pixblock_head
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm

.macro pixman_composite_over_n_8888_8888_ca_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    /* d8-d15 are callee saved (AAPCS) */
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    /* replicate the solid source color per channel: d8=B, d9=G, d10=R, d11=A */
    vdup.8      d8, d11[0]
    vdup.8      d9, d11[1]
    vdup.8      d10, d11[2]
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_over_n_8888_8888_ca_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8888_8888_ca_init, \
    pixman_composite_over_n_8888_8888_ca_cleanup, \
    pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
/******************************************************************************/
.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
    /*
     * 'combine_mask_ca' replacement
     *
     * input:  solid src (n) in {d8, d9, d10, d11}  [B, G, R, A]
     *         mask in          {d24, d25, d26}     [B, G, R]
     * output: updated src in   {d0, d1, d2 }       [B, G, R]
     *         updated mask in  {d24, d25, d26}     [B, G, R]
     */
    vmull.u8    q0, d24, d8
    vmull.u8    q1, d25, d9
    vmull.u8    q6, d26, d10
    vmull.u8    q9, d11, d25
    vmull.u8    q12, d11, d24
    vmull.u8    q13, d11, d26
    vrshr.u16   q8, q0, #8
    vrshr.u16   q10, q1, #8
    vrshr.u16   q11, q6, #8
    vraddhn.u16 d0, q0, q8
    vraddhn.u16 d1, q1, q10
    vraddhn.u16 d2, q6, q11
    vrshr.u16   q11, q12, #8
    vrshr.u16   q8, q9, #8
    vrshr.u16   q6, q13, #8
    vraddhn.u16 d24, q12, q11
    vraddhn.u16 d25, q9, q8
    /*
     * convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
     * and put data into d16 - blue, d17 - green, d18 - red
     */
    vshrn.u16   d17, q2, #3
    vshrn.u16   d18, q2, #8
    vraddhn.u16 d26, q13, q6
    vsli.u16    q2, q2, #5
    vsri.u8     d18, d18, #5
    vsri.u8     d17, d17, #6
    /*
     * 'combine_over_ca' replacement
     *
     * output: updated dest in d16 - blue, d17 - green, d18 - red
     */
    vmvn.8      q12, q12          /* invert (srca IN mask) for B and G */
    vshrn.u16   d16, q2, #2
    vmvn.8      d26, d26          /* ... and for R */
    vmull.u8    q6, d16, d24
    vmull.u8    q7, d17, d25
    vmull.u8    q11, d18, d26
.endm

.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
    /* ... continue 'combine_over_ca' replacement */
    vrshr.u16   q10, q6, #8
    vrshr.u16   q14, q7, #8
    vrshr.u16   q15, q11, #8
    vraddhn.u16 d16, q10, q6
    vraddhn.u16 d17, q14, q7
    vraddhn.u16 d18, q15, q11
    vqadd.u8    q8, q0, q8
    vqadd.u8    d18, d2, d18
    /*
     * convert the results in d16, d17, d18 to r5g6b5 and store
     * them into {d28, d29}
     */
    vshll.u8    q14, d18, #8
    vshll.u8    q10, d17, #8
    vshll.u8    q15, d16, #8
    vsri.u16    q14, q10, #5
    vsri.u16    q14, q15, #11
.endm

.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
    fetch_mask_pixblock
    vrshr.u16   q10, q6, #8
    vrshr.u16   q14, q7, #8
    vld1.16     {d4, d5}, [DST_R, :128]!
    vrshr.u16   q15, q11, #8
    vraddhn.u16 d16, q10, q6
    vraddhn.u16 d17, q14, q7
    vraddhn.u16 d22, q15, q11
    /* process_pixblock_head */
    /*
     * 'combine_mask_ca' replacement
     *
     * input:  solid src (n) in {d8, d9, d10, d11}  [B, G, R, A]
     *         mask in          {d24, d25, d26}     [B, G, R]
     * output: updated src in   {d0, d1, d2 }       [B, G, R]
     *         updated mask in  {d24, d25, d26}     [B, G, R]
     */
    vmull.u8    q6, d26, d10
    vqadd.u8    q8, q0, q8
    vmull.u8    q0, d24, d8
    vqadd.u8    d22, d2, d22
    vmull.u8    q1, d25, d9
    /*
     * convert the result in d16, d17, d22 to r5g6b5 and store
     * it into {d28, d29}
     */
    vshll.u8    q14, d22, #8
    vshll.u8    q10, d17, #8
    vshll.u8    q15, d16, #8
    vmull.u8    q9, d11, d25
    vsri.u16    q14, q10, #5
    vmull.u8    q12, d11, d24
    vmull.u8    q13, d11, d26
    vsri.u16    q14, q15, #11
    cache_preload 8, 8
    vrshr.u16   q8, q0, #8
    vrshr.u16   q10, q1, #8
    vrshr.u16   q11, q6, #8
    vraddhn.u16 d0, q0, q8
    vraddhn.u16 d1, q1, q10
    vraddhn.u16 d2, q6, q11
    vrshr.u16   q11, q12, #8
    vrshr.u16   q8, q9, #8
    vrshr.u16   q6, q13, #8
    vraddhn.u16 d24, q12, q11
    vraddhn.u16 d25, q9, q8
    /*
     * convert 8 r5g6b5 pixel data from {d4, d5} to planar
     * 8-bit format and put data into d16 - blue, d17 - green,
     * d18 - red
     */
    vshrn.u16   d17, q2, #3
    vshrn.u16   d18, q2, #8
    vraddhn.u16 d26, q13, q6
    vsli.u16    q2, q2, #5
    vsri.u8     d17, d17, #6
    vsri.u8     d18, d18, #5
    /*
     * 'combine_over_ca' replacement
     *
     * output: updated dest in d16 - blue, d17 - green, d18 - red
     */
    vmvn.8      q12, q12
    vshrn.u16   d16, q2, #2
    vmvn.8      d26, d26
    vmull.u8    q7, d17, d25
    vmull.u8    q6, d16, d24
    vmull.u8    q11, d18, d26
    vst1.16     {d28, d29}, [DST_W, :128]!
.endm

.macro pixman_composite_over_n_8888_0565_ca_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    /* d8-d15 are callee saved (AAPCS) */
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    /* replicate the solid source color per channel: d8=B, d9=G, d10=R, d11=A */
    vdup.8      d8, d11[0]
    vdup.8      d9, d11[1]
    vdup.8      d10, d11[2]
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_over_n_8888_0565_ca_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8888_0565_ca_init, \
    pixman_composite_over_n_8888_0565_ca_cleanup, \
    pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
/******************************************************************************/
.macro pixman_composite_in_n_8_process_pixblock_head
    /* expecting source data in {d0, d1, d2, d3} */
    /* and destination data in {d4, d5, d6, d7} */
    vmull.u8    q8, d4, d3
    vmull.u8    q9, d5, d3
    vmull.u8    q10, d6, d3
    vmull.u8    q11, d7, d3
.endm

.macro pixman_composite_in_n_8_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q8, q14
    vraddhn.u16 d29, q9, q15
    vraddhn.u16 d30, q10, q12
    vraddhn.u16 d31, q11, q13
.endm

.macro pixman_composite_in_n_8_process_pixblock_tail_head
    pixman_composite_in_n_8_process_pixblock_tail
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
    cache_preload 32, 32
    pixman_composite_in_n_8_process_pixblock_head
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm

.macro pixman_composite_in_n_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
    /* broadcast source alpha into all lanes of d3 */
    vdup.8      d3, d3[3]
.endm

.macro pixman_composite_in_n_8_cleanup
.endm

generate_composite_function \
    pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    5,  /* prefetch distance */ \
    pixman_composite_in_n_8_init, \
    pixman_composite_in_n_8_cleanup, \
    pixman_composite_in_n_8_process_pixblock_head, \
    pixman_composite_in_n_8_process_pixblock_tail, \
    pixman_composite_in_n_8_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    24  /* mask_basereg  */
.macro pixman_composite_add_n_8_8_process_pixblock_head
    /* expecting source data in {d8, d9, d10, d11} */
    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
    /* and destination data in {d4, d5, d6, d7} */
    /* mask is in d24, d25, d26, d27 */
    vmull.u8    q0, d24, d11
    vmull.u8    q1, d25, d11
    vmull.u8    q6, d26, d11
    vmull.u8    q7, d27, d11
    vrshr.u16   q10, q0, #8
    vrshr.u16   q11, q1, #8
    vrshr.u16   q12, q6, #8
    vrshr.u16   q13, q7, #8
    vraddhn.u16 d0, q0, q10
    vraddhn.u16 d1, q1, q11
    vraddhn.u16 d2, q6, q12
    vraddhn.u16 d3, q7, q13
    vqadd.u8    q14, q0, q2
    vqadd.u8    q15, q1, q3
.endm

.macro pixman_composite_add_n_8_8_process_pixblock_tail
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_add_n_8_8_process_pixblock_tail_head
    pixman_composite_add_n_8_8_process_pixblock_tail
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
    fetch_mask_pixblock
    cache_preload 32, 32
    pixman_composite_add_n_8_8_process_pixblock_head
.endm

.macro pixman_composite_add_n_8_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    /* d8-d15 are callee saved (AAPCS) */
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    /* broadcast source alpha into all lanes of d11 */
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_add_n_8_8_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    5,  /* prefetch distance */ \
    pixman_composite_add_n_8_8_init, \
    pixman_composite_add_n_8_8_cleanup, \
    pixman_composite_add_n_8_8_process_pixblock_head, \
    pixman_composite_add_n_8_8_process_pixblock_tail, \
    pixman_composite_add_n_8_8_process_pixblock_tail_head
/******************************************************************************/
.macro pixman_composite_add_8_8_8_process_pixblock_head
    /* expecting source data in {d0, d1, d2, d3} */
    /* destination data in {d4, d5, d6, d7} */
    /* mask in {d24, d25, d26, d27} */
    vmull.u8    q8, d24, d0
    vmull.u8    q9, d25, d1
    vmull.u8    q10, d26, d2
    vmull.u8    q11, d27, d3
    vrshr.u16   q0, q8, #8
    vrshr.u16   q1, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d0, q0, q8
    vraddhn.u16 d1, q1, q9
    vraddhn.u16 d2, q12, q10
    vraddhn.u16 d3, q13, q11
    vqadd.u8    q14, q0, q2
    vqadd.u8    q15, q1, q3
.endm

.macro pixman_composite_add_8_8_8_process_pixblock_tail
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_add_8_8_8_process_pixblock_tail_head
    pixman_composite_add_8_8_8_process_pixblock_tail
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
    fetch_src_pixblock
    fetch_mask_pixblock
    cache_preload 32, 32
    pixman_composite_add_8_8_8_process_pixblock_head
.endm

.macro pixman_composite_add_8_8_8_init
.endm

.macro pixman_composite_add_8_8_8_cleanup
.endm

generate_composite_function \
    pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    5,  /* prefetch distance */ \
    pixman_composite_add_8_8_8_init, \
    pixman_composite_add_8_8_8_cleanup, \
    pixman_composite_add_8_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_8_process_pixblock_tail, \
    pixman_composite_add_8_8_8_process_pixblock_tail_head
1977
/******************************************************************************/
.macro pixman_composite_add_8888_8888_8888_process_pixblock_head
    /* expecting source data in {d0, d1, d2, d3} */
    /* destination data in {d4, d5, d6, d7} */
    /* mask in {d24, d25, d26, d27} */
    vmull.u8    q8, d27, d0
    vmull.u8    q9, d27, d1
    vmull.u8    q10, d27, d2
    vmull.u8    q11, d27, d3
    /* 1 cycle bubble */
    vrsra.u16   q8, q8, #8
    vrsra.u16   q9, q9, #8
    vrsra.u16   q10, q10, #8
    vrsra.u16   q11, q11, #8
.endm

.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
    /* 2 cycle bubble */
    vrshrn.u16  d28, q8, #8
    vrshrn.u16  d29, q9, #8
    vrshrn.u16  d30, q10, #8
    vrshrn.u16  d31, q11, #8
    vqadd.u8    q14, q2, q14
    /* 1 cycle bubble */
    vqadd.u8    q15, q3, q15
.endm

.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
    fetch_src_pixblock
    vrshrn.u16  d28, q8, #8
    fetch_mask_pixblock
    vrshrn.u16  d29, q9, #8
    vmull.u8    q8, d27, d0
    vrshrn.u16  d30, q10, #8
    vmull.u8    q9, d27, d1
    vrshrn.u16  d31, q11, #8
    vmull.u8    q10, d27, d2
    vqadd.u8    q14, q2, q14
    vmull.u8    q11, d27, d3
    vqadd.u8    q15, q3, q15
    vrsra.u16   q8, q8, #8
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vrsra.u16   q9, q9, #8
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vrsra.u16   q10, q10, #8
    cache_preload 8, 8
    vrsra.u16   q11, q11, #8
.endm

generate_composite_function \
    pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8,  /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head

generate_composite_function_single_scanline \
    pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
2050
/******************************************************************************/
generate_composite_function \
    pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    27  /* mask_basereg  */
/******************************************************************************/
.macro pixman_composite_add_n_8_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
    /* replicate the solid source color per channel into d0-d3 */
    vdup.8      d0, d3[0]
    vdup.8      d1, d3[1]
    vdup.8      d2, d3[2]
    vdup.8      d3, d3[3]
.endm

.macro pixman_composite_add_n_8_8888_cleanup
.endm

generate_composite_function \
    pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_add_n_8_8888_init, \
    pixman_composite_add_n_8_8888_cleanup, \
    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    27  /* mask_basereg  */
/******************************************************************************/
.macro pixman_composite_add_8888_n_8888_init
    /* solid mask is the third argument: skip src (+8) on the stack */
    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
    vld1.32     {d27[0]}, [DUMMY]
    /* broadcast mask alpha into all lanes of d27 */
    vdup.8      d27, d27[3]
.endm

.macro pixman_composite_add_8888_n_8888_cleanup
.endm

generate_composite_function \
    pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_add_8888_n_8888_init, \
    pixman_composite_add_8888_n_8888_cleanup, \
    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    27  /* mask_basereg  */
/******************************************************************************/
2124
.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
    /* expecting source data in {d0, d1, d2, d3} */
    /* destination data in {d4, d5, d6, d7} */
    /* solid mask is in d15 */

    /* 'in' — multiply src by mask; vrshr+vraddhn pair implements the
     * rounding x/255 ≈ (x + ((x + 128) >> 8) + 128) >> 8 division */
    vmull.u8    q8, d15, d3
    vmull.u8    q6, d15, d2
    vmull.u8    q5, d15, d1
    vmull.u8    q4, d15, d0
    vrshr.u16   q13, q8, #8
    vrshr.u16   q12, q6, #8
    vrshr.u16   q11, q5, #8
    vrshr.u16   q10, q4, #8
    vraddhn.u16 d3, q8, q13
    vraddhn.u16 d2, q6, q12
    vraddhn.u16 d1, q5, q11
    vraddhn.u16 d0, q4, q10
    vmvn.8      d24, d3  /* get inverted alpha */
    /* now do alpha blending: dst * (1 - src.a) */
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d24, d5
    vmull.u8    q10, d24, d6
    vmull.u8    q11, d24, d7
.endm
.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
    /* Finish the dst * (1 - src.a) products started in the head;
     * result lands in {d28, d29, d30, d31}. */
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
.endm
/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
    /* NOTE(review): the fetch/preload lines were dropped by extraction
     * and are restored per this file's tail_head convention — confirm. */
    fetch_src_pixblock
    cache_preload 8, 8
    fetch_mask_pixblock
    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
.endm
/* Single-scanline OUT_REVERSE with per-pixel 8888 mask.
 * (Fixed: the tail_head argument was missing its trailing comma.) */
generate_composite_function_single_scanline \
    pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
    pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg */ \
    12  /* mask_basereg */
2186
/******************************************************************************/
2188
/* OVER head is identical to OUT_REVERSE head; OVER just adds src back
 * in the tail. */
.macro pixman_composite_over_8888_n_8888_process_pixblock_head
    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
.endm
.macro pixman_composite_over_8888_n_8888_process_pixblock_tail
    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
    /* OVER = (src IN mask) + dst * (1 - (src IN mask).a), saturating add */
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm
/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
    pixman_composite_over_8888_n_8888_process_pixblock_tail
    /* NOTE(review): fetch/preload lines restored after extraction loss;
     * no fetch_mask here — the mask is solid (preloaded into d15). */
    fetch_src_pixblock
    cache_preload 8, 8
    pixman_composite_over_8888_n_8888_process_pixblock_head
    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
.endm
.macro pixman_composite_over_8888_n_8888_init
    /* Solid mask: splat its alpha byte across d15. d8-d15 are
     * callee-saved per AAPCS, so save them before clobbering d15.
     * NOTE(review): the add/vpush/vdup lines were dropped by extraction;
     * restored here (offset computed before the vpush) — TODO confirm
     * the #48 (= ARGS_STACK_OFFSET + 8) against upstream. */
    add         DUMMY, sp, #48
    vpush       {d8-d15}
    vld1.32     {d15[0]}, [DUMMY]
    vdup.8      d15, d15[3]
.endm

.macro pixman_composite_over_8888_n_8888_cleanup
    vpop        {d8-d15}
.endm
/* OVER with 8888 src and solid mask; default base registers apply. */
generate_composite_function \
    pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_8888_n_8888_init, \
    pixman_composite_over_8888_n_8888_cleanup, \
    pixman_composite_over_8888_n_8888_process_pixblock_head, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail_head
2230
/******************************************************************************/
2232
/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
    pixman_composite_over_8888_n_8888_process_pixblock_tail
    /* NOTE(review): fetch/preload lines restored after extraction loss;
     * here the mask is per-pixel, so it is fetched every block. */
    fetch_src_pixblock
    cache_preload 8, 8
    fetch_mask_pixblock
    pixman_composite_over_8888_n_8888_process_pixblock_head
    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
.endm
/* OVER with 8888 src and per-pixel 8888 mask.
 * (Fixed: the tail_head argument was missing its trailing comma.) */
generate_composite_function \
    pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_8888_n_8888_process_pixblock_head, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg */ \
    12  /* mask_basereg */
2258
/* Single-scanline OVER with per-pixel 8888 mask.
 * (Fixed: the tail_head argument was missing its trailing comma.) */
generate_composite_function_single_scanline \
    pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_8888_n_8888_process_pixblock_head, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg */ \
    12  /* mask_basereg */
2272
/******************************************************************************/
2274
/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
    pixman_composite_over_8888_n_8888_process_pixblock_tail
    /* NOTE(review): fetch/preload lines restored after extraction loss;
     * the 8-bit mask is fetched into d15 (mask_basereg = 15). */
    fetch_src_pixblock
    cache_preload 8, 8
    fetch_mask_pixblock
    pixman_composite_over_8888_n_8888_process_pixblock_head
    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
.endm
/* OVER with 8888 src and per-pixel a8 mask.
 * (Fixed: the tail_head argument was missing its trailing comma.) */
generate_composite_function \
    pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_8888_n_8888_process_pixblock_head, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg */ \
    15  /* mask_basereg */
2300
/******************************************************************************/
2302
.macro pixman_composite_src_0888_0888_process_pixblock_head
2305
.macro pixman_composite_src_0888_0888_process_pixblock_tail
2308
.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
2309
vst3.8 {d0, d1, d2}, [DST_W]!
2314
generate_composite_function \
2315
pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
2316
FLAG_DST_WRITEONLY, \
2317
8, /* number of pixels, processed in a single block */ \
2318
10, /* prefetch distance */ \
2321
pixman_composite_src_0888_0888_process_pixblock_head, \
2322
pixman_composite_src_0888_0888_process_pixblock_tail, \
2323
pixman_composite_src_0888_0888_process_pixblock_tail_head, \
2324
0, /* dst_w_basereg */ \
2325
0, /* dst_r_basereg */ \
2326
0, /* src_basereg */ \
2327
0 /* mask_basereg */
2329
/******************************************************************************/
2331
.macro pixman_composite_src_0888_8888_rev_process_pixblock_head
2335
.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
2338
.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
2339
vst4.8 {d0, d1, d2, d3}, [DST_W]!
2345
.macro pixman_composite_src_0888_8888_rev_init
2349
generate_composite_function \
2350
pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
2351
FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2352
8, /* number of pixels, processed in a single block */ \
2353
10, /* prefetch distance */ \
2354
pixman_composite_src_0888_8888_rev_init, \
2356
pixman_composite_src_0888_8888_rev_process_pixblock_head, \
2357
pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
2358
pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
2359
0, /* dst_w_basereg */ \
2360
0, /* dst_r_basereg */ \
2361
0, /* src_basereg */ \
2362
0 /* mask_basereg */
2364
/******************************************************************************/
2366
.macro pixman_composite_src_0888_0565_rev_process_pixblock_head
2371
.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
2372
vshll.u8 q14, d0, #8
2373
vsri.u16 q14, q8, #5
2374
vsri.u16 q14, q9, #11
2377
.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
2378
vshll.u8 q14, d0, #8
2380
vsri.u16 q14, q8, #5
2381
vsri.u16 q14, q9, #11
2383
vst1.16 {d28, d29}, [DST_W, :128]!
2387
generate_composite_function \
2388
pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
2389
FLAG_DST_WRITEONLY, \
2390
8, /* number of pixels, processed in a single block */ \
2391
10, /* prefetch distance */ \
2394
pixman_composite_src_0888_0565_rev_process_pixblock_head, \
2395
pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
2396
pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
2397
28, /* dst_w_basereg */ \
2398
0, /* dst_r_basereg */ \
2399
0, /* src_basereg */ \
2400
0 /* mask_basereg */
2402
/******************************************************************************/
2404
.macro pixman_composite_src_pixbuf_8888_process_pixblock_head
2407
vmull.u8 q10, d3, d2
2410
.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
2411
vrshr.u16 q11, q8, #8
2413
vrshr.u16 q12, q9, #8
2414
vrshr.u16 q13, q10, #8
2415
vraddhn.u16 d30, q11, q8
2416
vraddhn.u16 d29, q12, q9
2417
vraddhn.u16 d28, q13, q10
2420
.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
2421
vrshr.u16 q11, q8, #8
2423
vrshr.u16 q12, q9, #8
2424
vrshr.u16 q13, q10, #8
2426
vraddhn.u16 d30, q11, q8
2427
PF add PF_X, PF_X, #8
2429
PF addne PF_X, PF_X, #8
2430
PF subne PF_CTL, PF_CTL, #1
2431
vraddhn.u16 d29, q12, q9
2432
vraddhn.u16 d28, q13, q10
2435
vmull.u8 q10, d3, d2
2436
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2438
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
2439
PF subge PF_X, PF_X, ORIG_W
2440
PF subges PF_CTL, PF_CTL, #0x10
2441
PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
2444
generate_composite_function \
2445
pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
2446
FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2447
8, /* number of pixels, processed in a single block */ \
2448
10, /* prefetch distance */ \
2451
pixman_composite_src_pixbuf_8888_process_pixblock_head, \
2452
pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
2453
pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
2454
28, /* dst_w_basereg */ \
2455
0, /* dst_r_basereg */ \
2456
0, /* src_basereg */ \
2457
0 /* mask_basereg */
2459
/******************************************************************************/
2461
.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
2464
vmull.u8 q10, d3, d2
2467
.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
2468
vrshr.u16 q11, q8, #8
2470
vrshr.u16 q12, q9, #8
2471
vrshr.u16 q13, q10, #8
2472
vraddhn.u16 d28, q11, q8
2473
vraddhn.u16 d29, q12, q9
2474
vraddhn.u16 d30, q13, q10
2477
.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
2478
vrshr.u16 q11, q8, #8
2480
vrshr.u16 q12, q9, #8
2481
vrshr.u16 q13, q10, #8
2483
vraddhn.u16 d28, q11, q8
2484
PF add PF_X, PF_X, #8
2486
PF addne PF_X, PF_X, #8
2487
PF subne PF_CTL, PF_CTL, #1
2488
vraddhn.u16 d29, q12, q9
2489
vraddhn.u16 d30, q13, q10
2492
vmull.u8 q10, d3, d2
2493
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2495
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
2496
PF subge PF_X, PF_X, ORIG_W
2497
PF subges PF_CTL, PF_CTL, #0x10
2498
PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
2501
generate_composite_function \
2502
pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
2503
FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2504
8, /* number of pixels, processed in a single block */ \
2505
10, /* prefetch distance */ \
2508
pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
2509
pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
2510
pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
2511
28, /* dst_w_basereg */ \
2512
0, /* dst_r_basereg */ \
2513
0, /* src_basereg */ \
2514
0 /* mask_basereg */
2516
/******************************************************************************/
2518
.macro pixman_composite_over_0565_8_0565_process_pixblock_head
    /* mask is in d15 */
    convert_0565_to_x888 q4, d2, d1, d0
    convert_0565_to_x888 q5, d6, d5, d4
    /* source pixel data is in {d0, d1, d2, XX} */
    /* destination pixel data is in {d4, d5, d6, XX} */
    /* NOTE(review): the vmvn and the two dst vmull lines were dropped by
     * extraction; restored (d7 = ~mask feeds the visible q13 multiply). */
    vmvn.8      d7, d15
    /* src IN mask */
    vmull.u8    q6, d15, d2
    vmull.u8    q5, d15, d1
    vmull.u8    q4, d15, d0
    /* dst * (1 - mask) */
    vmull.u8    q8, d7, d4
    vmull.u8    q9, d7, d5
    vmull.u8    q13, d7, d6
    /* rounding division by 255 for the src IN mask products */
    vrshr.u16   q12, q6, #8
    vrshr.u16   q11, q5, #8
    vrshr.u16   q10, q4, #8
    vraddhn.u16 d2, q6, q12
    vraddhn.u16 d1, q5, q11
    vraddhn.u16 d0, q4, q10
.endm
.macro pixman_composite_over_0565_8_0565_process_pixblock_tail
    /* finish dst * (1 - mask), then saturating-add src IN mask */
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q13, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q13
    vqadd.u8    q0, q0, q14
    vqadd.u8    q1, q1, q15
    /* 32bpp result is in {d0, d1, d2, XX} */
    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
.endm
/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
    /* NOTE(review): fetch/preload lines restored after extraction loss. */
    fetch_mask_pixblock
    pixman_composite_over_0565_8_0565_process_pixblock_tail
    fetch_src_pixblock
    vld1.16    {d10, d11}, [DST_R, :128]!
    cache_preload 8, 8
    pixman_composite_over_0565_8_0565_process_pixblock_head
    vst1.16    {d28, d29}, [DST_W, :128]!
.endm
/* OVER with r5g6b5 src/dst and a8 mask. */
generate_composite_function \
    pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_0565_8_0565_process_pixblock_head, \
    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    10, /* dst_r_basereg */ \
    8,  /* src_basereg */ \
    15  /* mask_basereg */
2578
/******************************************************************************/
2580
.macro pixman_composite_over_0565_n_0565_init
    /* Solid mask: splat its alpha into d15; d8-d15 are callee-saved.
     * NOTE(review): vpush/vdup lines restored after extraction loss —
     * confirm against upstream. */
    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
    vpush       {d8-d15}
    vld1.32     {d15[0]}, [DUMMY]
    vdup.8      d15, d15[3]
.endm

.macro pixman_composite_over_0565_n_0565_cleanup
    vpop        {d8-d15}
.endm
/* OVER with r5g6b5 src/dst and a solid mask (alpha splatted into d15,
 * so the 0565_8_0565 pixel-block pipeline can be reused unchanged). */
generate_composite_function \
    pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_0565_n_0565_init, \
    pixman_composite_over_0565_n_0565_cleanup, \
    pixman_composite_over_0565_8_0565_process_pixblock_head, \
    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    10, /* dst_r_basereg */ \
    8,  /* src_basereg */ \
    15  /* mask_basereg */
2606
/******************************************************************************/
2608
.macro pixman_composite_add_0565_8_0565_process_pixblock_head
    /* mask is in d15 */
    convert_0565_to_x888 q4, d2, d1, d0
    convert_0565_to_x888 q5, d6, d5, d4
    /* source pixel data is in {d0, d1, d2, XX} */
    /* destination pixel data is in {d4, d5, d6, XX} */
    /* 'in': src * mask with rounding division by 255 */
    vmull.u8    q6, d15, d2
    vmull.u8    q5, d15, d1
    vmull.u8    q4, d15, d0
    vrshr.u16   q12, q6, #8
    vrshr.u16   q11, q5, #8
    vrshr.u16   q10, q4, #8
    vraddhn.u16 d2, q6, q12
    vraddhn.u16 d1, q5, q11
    vraddhn.u16 d0, q4, q10
.endm
.macro pixman_composite_add_0565_8_0565_process_pixblock_tail
    /* ADD: dst += src IN mask (saturating). dst stayed in d4-d6 (q2/q3).
     * NOTE(review): the two vqadd lines were dropped by extraction and
     * are restored here — confirm against upstream. */
    vqadd.u8    q0, q0, q2
    vqadd.u8    q1, q1, q3
    /* 32bpp result is in {d0, d1, d2, XX} */
    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
.endm
/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
    /* NOTE(review): fetch/preload lines restored after extraction loss. */
    fetch_mask_pixblock
    pixman_composite_add_0565_8_0565_process_pixblock_tail
    fetch_src_pixblock
    vld1.16    {d10, d11}, [DST_R, :128]!
    cache_preload 8, 8
    pixman_composite_add_0565_8_0565_process_pixblock_head
    vst1.16    {d28, d29}, [DST_W, :128]!
.endm
/* ADD with r5g6b5 src/dst and a8 mask. */
generate_composite_function \
    pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_add_0565_8_0565_process_pixblock_head, \
    pixman_composite_add_0565_8_0565_process_pixblock_tail, \
    pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    10, /* dst_r_basereg */ \
    8,  /* src_basereg */ \
    15  /* mask_basereg */
2658
/******************************************************************************/
2660
.macro pixman_composite_out_reverse_8_0565_process_pixblock_head
    /* mask is in d15 */
    convert_0565_to_x888 q5, d6, d5, d4
    /* destination pixel data is in {d4, d5, d6, xx} */
    vmvn.8      d24, d15 /* get inverted alpha */
    /* now do alpha blending: dst * (1 - mask) */
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d24, d5
    vmull.u8    q10, d24, d6
.endm
.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
    /* finish the rounding division by 255 and repack to r5g6b5 */
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vraddhn.u16 d0, q14, q8
    vraddhn.u16 d1, q15, q9
    vraddhn.u16 d2, q12, q10
    /* 32bpp result is in {d0, d1, d2, XX} */
    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
.endm
/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
    /* NOTE(review): fetch/preload lines restored after extraction loss;
     * the a8 source is fetched into d15 (src_basereg = 15). */
    fetch_src_pixblock
    pixman_composite_out_reverse_8_0565_process_pixblock_tail
    vld1.16    {d10, d11}, [DST_R, :128]!
    cache_preload 8, 8
    pixman_composite_out_reverse_8_0565_process_pixblock_head
    vst1.16    {d28, d29}, [DST_W, :128]!
.endm
/* OUT_REVERSE with a8 source and r5g6b5 dst: dst *= (1 - src). */
generate_composite_function \
    pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_out_reverse_8_0565_process_pixblock_head, \
    pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
    pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    10, /* dst_r_basereg */ \
    15, /* src_basereg */ \
    0   /* mask_basereg */
2707
/******************************************************************************/
2709
.macro pixman_composite_out_reverse_8_8888_process_pixblock_head
    /* src is in d0 */
    /* destination pixel data is in {d4, d5, d6, d7} */
    vmvn.8      d1, d0 /* get inverted alpha */
    /* now do alpha blending: dst * (1 - src)
     * NOTE(review): the q8/q9 multiplies were dropped by extraction and
     * are restored here (the tail consumes q8-q11) — confirm upstream. */
    vmull.u8    q8, d1, d4
    vmull.u8    q9, d1, d5
    vmull.u8    q10, d1, d6
    vmull.u8    q11, d1, d7
.endm
.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail
    /* finish the rounding division by 255 for all four channels */
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
    /* 32bpp result is in {d28, d29, d30, d31} */
.endm
/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
    /* NOTE(review): fetch/preload lines restored after extraction loss;
     * the a8 source is fetched into d0 (src_basereg = 0). */
    fetch_src_pixblock
    pixman_composite_out_reverse_8_8888_process_pixblock_tail
    vld4.8    {d4, d5, d6, d7}, [DST_R, :128]!
    cache_preload 8, 8
    pixman_composite_out_reverse_8_8888_process_pixblock_head
    vst4.8    {d28, d29, d30, d31}, [DST_W, :128]!
.endm
/* OUT_REVERSE with a8 source and a8r8g8b8 dst: dst *= (1 - src).
 * NOTE(review): the init/cleanup arguments were dropped by extraction;
 * restored as the plain defaults (this pipeline avoids d8-d15). */
generate_composite_function \
    pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_out_reverse_8_8888_process_pixblock_head, \
    pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
    pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg */ \
    0   /* mask_basereg */
2757
/******************************************************************************/
2759
/* Nearest-neighbour scaled scanline, 8888 OVER 8888.
 * NOTE(review): init/cleanup arguments restored after extraction loss. */
generate_composite_function_nearest_scanline \
    pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_process_pixblock_tail_head
2769
/* Nearest-neighbour scaled scanline, 8888 OVER 0565.
 * NOTE(review): init/cleanup arguments restored after extraction loss. */
generate_composite_function_nearest_scanline \
    pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_0565_process_pixblock_head, \
    pixman_composite_over_8888_0565_process_pixblock_tail, \
    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg */ \
    24  /* mask_basereg */
2783
/* Nearest-neighbour scaled scanline, 8888 SRC 0565 (pure conversion).
 * NOTE(review): init/cleanup arguments restored after extraction loss. */
generate_composite_function_nearest_scanline \
    pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_8888_0565_process_pixblock_head, \
    pixman_composite_src_8888_0565_process_pixblock_tail, \
    pixman_composite_src_8888_0565_process_pixblock_tail_head
2793
/* Nearest-neighbour scaled scanline, 0565 SRC 8888 (pure conversion).
 * NOTE(review): init/cleanup arguments restored after extraction loss. */
generate_composite_function_nearest_scanline \
    pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0565_8888_process_pixblock_head, \
    pixman_composite_src_0565_8888_process_pixblock_tail, \
    pixman_composite_src_0565_8888_process_pixblock_tail_head
2803
/* Nearest-neighbour scaled scanline, 8888 OVER 0565 with a8 mask. */
generate_composite_function_nearest_scanline \
    pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    8,  /* src_basereg */ \
    24  /* mask_basereg */
2817
/* Nearest-neighbour scaled scanline, 0565 OVER 0565 with a8 mask. */
generate_composite_function_nearest_scanline \
    pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_0565_8_0565_process_pixblock_head, \
    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    10, /* dst_r_basereg */ \
    8,  /* src_basereg */ \
    15  /* mask_basereg */
2831
/******************************************************************************/
2833
/* Supplementary macro for setting function attributes */
2834
.macro pixman_asm_function fname
2839
.type fname, %function
2845
* Bilinear scaling support code which tries to provide pixel fetching, color
2846
* format conversion, and interpolation as separate macros which can be used
2847
* as the basic building blocks for constructing bilinear scanline functions.
2850
.macro bilinear_load_8888 reg1, reg2, tmp
2851
mov TMP1, X, asr #16
2853
add TMP1, TOP, TMP1, asl #2
2854
vld1.32 {reg1}, [TMP1], STRIDE
2855
vld1.32 {reg2}, [TMP1]
2858
.macro bilinear_load_0565 reg1, reg2, tmp
2859
mov TMP1, X, asr #16
2861
add TMP1, TOP, TMP1, asl #1
2862
vld1.32 {reg2[0]}, [TMP1], STRIDE
2863
vld1.32 {reg2[1]}, [TMP1]
2864
convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
2867
.macro bilinear_load_and_vertical_interpolate_two_8888 \
2868
acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
2870
bilinear_load_8888 reg1, reg2, tmp1
2871
vmull.u8 acc1, reg1, d28
2872
vmlal.u8 acc1, reg2, d29
2873
bilinear_load_8888 reg3, reg4, tmp2
2874
vmull.u8 acc2, reg3, d28
2875
vmlal.u8 acc2, reg4, d29
2878
.macro bilinear_load_and_vertical_interpolate_four_8888 \
2879
xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
2880
yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
2882
bilinear_load_and_vertical_interpolate_two_8888 \
2883
xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
2884
bilinear_load_and_vertical_interpolate_two_8888 \
2885
yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
2888
.macro bilinear_load_and_vertical_interpolate_two_0565 \
2889
acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
2891
mov TMP1, X, asr #16
2893
add TMP1, TOP, TMP1, asl #1
2894
mov TMP2, X, asr #16
2896
add TMP2, TOP, TMP2, asl #1
2897
vld1.32 {acc2lo[0]}, [TMP1], STRIDE
2898
vld1.32 {acc2hi[0]}, [TMP2], STRIDE
2899
vld1.32 {acc2lo[1]}, [TMP1]
2900
vld1.32 {acc2hi[1]}, [TMP2]
2901
convert_0565_to_x888 acc2, reg3, reg2, reg1
2906
vmull.u8 acc1, reg1, d28
2907
vmlal.u8 acc1, reg2, d29
2908
vmull.u8 acc2, reg3, d28
2909
vmlal.u8 acc2, reg4, d29
2912
.macro bilinear_load_and_vertical_interpolate_four_0565 \
2913
xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
2914
yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
2916
mov TMP1, X, asr #16
2918
add TMP1, TOP, TMP1, asl #1
2919
mov TMP2, X, asr #16
2921
add TMP2, TOP, TMP2, asl #1
2922
vld1.32 {xacc2lo[0]}, [TMP1], STRIDE
2923
vld1.32 {xacc2hi[0]}, [TMP2], STRIDE
2924
vld1.32 {xacc2lo[1]}, [TMP1]
2925
vld1.32 {xacc2hi[1]}, [TMP2]
2926
convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
2927
mov TMP1, X, asr #16
2929
add TMP1, TOP, TMP1, asl #1
2930
mov TMP2, X, asr #16
2932
add TMP2, TOP, TMP2, asl #1
2933
vld1.32 {yacc2lo[0]}, [TMP1], STRIDE
2934
vzip.u8 xreg1, xreg3
2935
vld1.32 {yacc2hi[0]}, [TMP2], STRIDE
2936
vzip.u8 xreg2, xreg4
2937
vld1.32 {yacc2lo[1]}, [TMP1]
2938
vzip.u8 xreg3, xreg4
2939
vld1.32 {yacc2hi[1]}, [TMP2]
2940
vzip.u8 xreg1, xreg2
2941
convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
2942
vmull.u8 xacc1, xreg1, d28
2943
vzip.u8 yreg1, yreg3
2944
vmlal.u8 xacc1, xreg2, d29
2945
vzip.u8 yreg2, yreg4
2946
vmull.u8 xacc2, xreg3, d28
2947
vzip.u8 yreg3, yreg4
2948
vmlal.u8 xacc2, xreg4, d29
2949
vzip.u8 yreg1, yreg2
2950
vmull.u8 yacc1, yreg1, d28
2951
vmlal.u8 yacc1, yreg2, d29
2952
vmull.u8 yacc2, yreg3, d28
2953
vmlal.u8 yacc2, yreg4, d29
2956
.macro bilinear_store_8888 numpix, tmp1, tmp2
2958
vst1.32 {d0, d1}, [OUT, :128]!
2960
vst1.32 {d0}, [OUT, :64]!
2962
vst1.32 {d0[0]}, [OUT, :32]!
2964
.error bilinear_store_8888 numpix is unsupported
2968
.macro bilinear_store_0565 numpix, tmp1, tmp2
2973
convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
2975
vst1.16 {d2}, [OUT, :64]!
2977
vst1.32 {d2[0]}, [OUT, :32]!
2979
vst1.16 {d2[0]}, [OUT, :16]!
2981
.error bilinear_store_0565 numpix is unsupported
2985
.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
2986
bilinear_load_&src_fmt d0, d1, d2
2987
vmull.u8 q1, d0, d28
2988
vmlal.u8 q1, d1, d29
2989
/* 5 cycles bubble */
2990
vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
2991
vmlsl.u16 q0, d2, d30
2992
vmlal.u16 q0, d3, d30
2993
/* 5 cycles bubble */
2994
vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
2995
/* 3 cycles bubble */
2997
/* 1 cycle bubble */
2998
bilinear_store_&dst_fmt 1, q2, q3
3001
.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
3002
bilinear_load_and_vertical_interpolate_two_&src_fmt \
3003
q1, q11, d0, d1, d20, d21, d22, d23
3004
vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
3005
vmlsl.u16 q0, d2, d30
3006
vmlal.u16 q0, d3, d30
3007
vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
3008
vmlsl.u16 q10, d22, d31
3009
vmlal.u16 q10, d23, d31
3010
vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3011
vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
3012
vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3013
vadd.u16 q12, q12, q13
3015
bilinear_store_&dst_fmt 2, q2, q3
3018
.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
3019
bilinear_load_and_vertical_interpolate_four_&src_fmt \
3020
q1, q11, d0, d1, d20, d21, d22, d23 \
3021
q3, q9, d4, d5, d16, d17, d18, d19
3023
sub TMP1, TMP1, STRIDE
3024
vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
3025
vmlsl.u16 q0, d2, d30
3026
vmlal.u16 q0, d3, d30
3027
vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
3028
vmlsl.u16 q10, d22, d31
3029
vmlal.u16 q10, d23, d31
3030
vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3031
vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
3032
vmlsl.u16 q2, d6, d30
3033
vmlal.u16 q2, d7, d30
3034
vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
3036
vmlsl.u16 q8, d18, d31
3037
vmlal.u16 q8, d19, d31
3038
vadd.u16 q12, q12, q13
3039
vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3040
vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
3041
vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
3042
vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
3043
vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3046
vadd.u16 q12, q12, q13
3047
bilinear_store_&dst_fmt 4, q2, q3
3050
.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
3051
.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
3052
bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
3054
bilinear_interpolate_four_pixels src_fmt, dst_fmt
3058
.macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
3059
.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
3060
bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
3064
.macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
3065
.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
3066
bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
3068
bilinear_interpolate_four_pixels src_fmt, dst_fmt
3072
.macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
3073
.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
3074
bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
3076
bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
3077
bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
3081
.macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
3082
.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
3083
bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
3085
bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
3089
.macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
3090
.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
3091
bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
3093
bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
3094
bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
3098
/* Flags for generate_bilinear_scanline_func: unroll factor and whether
 * the generated code may clobber all NEON registers (d8-d15 included). */
.set BILINEAR_FLAG_UNROLL_4,          0
.set BILINEAR_FLAG_UNROLL_8,          1
.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
* Main template macro for generating NEON optimized bilinear scanline
3106
* Bilinear scanline scaler macro template uses the following arguments:
3107
* fname - name of the function to generate
3108
* src_fmt - source color format (8888 or 0565)
3109
* dst_fmt - destination color format (8888 or 0565)
3110
* bpp_shift - (1 << bpp_shift) is the size of source pixel in bytes
3111
* prefetch_distance - prefetch in the source image by that many
3115
.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
                                       src_bpp_shift, dst_bpp_shift, \
                                       prefetch_distance, flags

pixman_asm_function fname
    OUT       .req      r0
    TOP       .req      r1
    BOTTOM    .req      r2
    WT        .req      r3
    WB        .req      r4
    X         .req      r5
    UX        .req      r6
    WIDTH     .req      ip
    TMP1      .req      r3
    TMP2      .req      r4
    PF_OFFS   .req      r7
    TMP3      .req      r8
    TMP4      .req      r9
    STRIDE    .req      r2

    /* the first four arguments arrive in r0-r3, the rest on the stack */
    mov       ip, sp
    push      {r4, r5, r6, r7, r8, r9}
    mov       PF_OFFS, #prefetch_distance
    ldmia     ip, {WB, X, UX, WIDTH}
    mul       PF_OFFS, PF_OFFS, UX

.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
    /* d8-d15 are callee saved per AAPCS and must be preserved */
    vpush     {d8-d15}
.endif

    sub       STRIDE, BOTTOM, TOP
    .unreq    BOTTOM

    cmp       WIDTH, #0
    ble       3f

    /* q12 = x accumulator, q13 = ux step, d28/d29 = vertical weights */
    vdup.u16  q12, X
    vdup.u16  q13, UX
    vdup.u8   d28, WT
    vdup.u8   d29, WB
    /* advance the upper half of the x accumulator by one ux step */
    vadd.u16  d25, d25, d26

    /* ensure good destination alignment  */
    cmp       WIDTH, #1
    blt       0f
    tst       OUT, #(1 << dst_bpp_shift)
    beq       0f
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    bilinear_interpolate_last_pixel src_fmt, dst_fmt
    sub       WIDTH, WIDTH, #1
0:
    vadd.u16  q13, q13, q13
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13

    cmp       WIDTH, #2
    blt       0f
    tst       OUT, #(1 << (dst_bpp_shift + 1))
    beq       0f
    bilinear_interpolate_two_pixels src_fmt, dst_fmt
    sub       WIDTH, WIDTH, #2
0:
.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
/*********** 8 pixels per iteration *****************/
    cmp       WIDTH, #4
    blt       0f
    tst       OUT, #(1 << (dst_bpp_shift + 2))
    beq       0f
    bilinear_interpolate_four_pixels src_fmt, dst_fmt
    sub       WIDTH, WIDTH, #4
0:
    subs      WIDTH, WIDTH, #8
    blt       1f
    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
    bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
    subs      WIDTH, WIDTH, #8
    blt       5f
0:
    bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
    subs      WIDTH, WIDTH, #8
    bge       0b
5:
    bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
1:
    tst       WIDTH, #4
    beq       2f
    bilinear_interpolate_four_pixels src_fmt, dst_fmt
2:
.else
/*********** 4 pixels per iteration *****************/
    subs      WIDTH, WIDTH, #4
    blt       1f
    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
    subs      WIDTH, WIDTH, #4
    blt       5f
0:
    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
    subs      WIDTH, WIDTH, #4
    bge       0b
5:
    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
1:
/****************************************************/
.endif
    /* handle the remaining trailing pixels */
    tst       WIDTH, #2
    beq       2f
    bilinear_interpolate_two_pixels src_fmt, dst_fmt
2:
    tst       WIDTH, #1
    beq       3f
    bilinear_interpolate_last_pixel src_fmt, dst_fmt
3:
.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
    vpop      {d8-d15}
.endif
    pop       {r4, r5, r6, r7, r8, r9}
    bx        lr

    .unreq    OUT
    .unreq    TOP
    .unreq    WT
    .unreq    WB
    .unreq    X
    .unreq    UX
    .unreq    WIDTH
    .unreq    TMP1
    .unreq    TMP2
    .unreq    PF_OFFS
    .unreq    TMP3
    .unreq    TMP4
    .unreq    STRIDE
.endfunc
.endm
/*****************************************************************************/
/* enable the format specific 8888 -> 8888 four-pixel fast path below */
.set have_bilinear_interpolate_four_pixels_8888_8888, 1
/*
 * Format specific four-pixel 8888 -> 8888 "head": starts the interpolation
 * of four pixels (loads, horizontal weighting into q0/q1 started).
 */
.macro bilinear_interpolate_four_pixels_8888_8888_head
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2

    vld1.32   {d22}, [TMP1], STRIDE
    vld1.32   {d23}, [TMP1]
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    vmull.u8  q8, d22, d28
    vmlal.u8  q8, d23, d29

    vld1.32   {d22}, [TMP2], STRIDE
    vld1.32   {d23}, [TMP2]
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmull.u8  q9, d22, d28
    vmlal.u8  q9, d23, d29

    vld1.32   {d22}, [TMP3], STRIDE
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29

    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30

    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29

    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31
.endm
/*
 * Format specific four-pixel 8888 -> 8888 "tail": finishes the horizontal
 * weighting, narrows the results and stores four destination pixels.
 */
.macro bilinear_interpolate_four_pixels_8888_8888_tail
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vadd.u16  q12, q12, q13
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vmovn.u16 d6, q0
    vmovn.u16 d7, q2
    vadd.u16  q12, q12, q13
    vst1.32   {d6, d7}, [OUT, :128]!
.endm
/*
 * Format specific four-pixel 8888 -> 8888 "tail_head": software pipelined
 * combination of the tail of one group with the head of the next, so loads
 * and arithmetic of adjacent groups overlap.
 */
.macro bilinear_interpolate_four_pixels_8888_8888_tail_head
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d20}, [TMP1], STRIDE
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22}, [TMP2], STRIDE
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vmovn.u16 d6, q0
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmovn.u16 d7, q2
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vadd.u16  q12, q12, q13
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vst1.32   {d6, d7}, [OUT, :128]!
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31
.endm
/*****************************************************************************/
/* enable the format specific 8888 -> 0565 eight-pixel fast path below */
.set have_bilinear_interpolate_eight_pixels_8888_0565, 1
/*
 * Format specific eight-pixel 8888 -> 0565 "head": starts the pipeline by
 * processing the first four pixels and beginning the next four; this macro
 * clobbers q4-q7, so the generated function must be built with
 * BILINEAR_FLAG_USE_ALL_NEON_REGS (the caller saves d8-d15).
 */
.macro bilinear_interpolate_eight_pixels_8888_0565_head
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vld1.32   {d20}, [TMP1], STRIDE
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
    vld1.32   {d22}, [TMP2], STRIDE
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30

    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31

    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d20}, [TMP1], STRIDE
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22}, [TMP2], STRIDE
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vmovn.u16 d8, q0
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmovn.u16 d9, q2
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30

    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vadd.u16  q12, q12, q13
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31
.endm
/*
 * Format specific eight-pixel 8888 -> 0565 "tail": finishes the last group,
 * converts the eight a8r8g8b8 results to r5g6b5 (deinterleave bytes with
 * vuzp, then pack with vshll/vsri) and stores them.
 */
.macro bilinear_interpolate_eight_pixels_8888_0565_tail
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vadd.u16  q12, q12, q13
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vmovn.u16 d10, q0
    vmovn.u16 d11, q2
    vadd.u16  q12, q12, q13

    vuzp.u8   d8, d9
    vuzp.u8   d10, d11
    vuzp.u8   d9, d11
    vuzp.u8   d8, d10
    vshll.u8  q6, d9, #8
    vshll.u8  q5, d10, #8
    vshll.u8  q7, d8, #8
    vsri.u16  q5, q6, #5
    vsri.u16  q5, q7, #11
    vst1.32   {d10, d11}, [OUT, :128]!
.endm
/*
 * Format specific eight-pixel 8888 -> 0565 "tail_head": finishes one
 * eight-pixel group (including the r5g6b5 packing and store) while starting
 * the next one, keeping loads, multiplies and the format conversion
 * interleaved for the software pipeline.
 */
.macro bilinear_interpolate_eight_pixels_8888_0565_tail_head
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vuzp.u8   d8, d9
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d20}, [TMP1], STRIDE
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22}, [TMP2], STRIDE
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vmovn.u16 d10, q0
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmovn.u16 d11, q2
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vadd.u16  q12, q12, q13
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vuzp.u8   d10, d11
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31

    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vmlal.u16 q1, d19, d31
    vuzp.u8   d9, d11
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vuzp.u8   d8, d10
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d20}, [TMP1], STRIDE
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
    vshll.u8  q6, d9, #8
    vshll.u8  q5, d10, #8
    vshll.u8  q7, d8, #8
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vsri.u16  q5, q6, #5
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vsri.u16  q5, q7, #11
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22}, [TMP2], STRIDE
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vmovn.u16 d8, q0
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmovn.u16 d9, q2
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vadd.u16  q12, q12, q13
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vst1.32   {d10, d11}, [OUT, :128]!
    vmlsl.u16 q1, d18, d31
.endm
/*****************************************************************************/
/* Instantiate the bilinear SRC scanline scalers for all format pairs. */
generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
    2, 2, 28, BILINEAR_FLAG_UNROLL_4

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
    2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
    1, 2, 28, BILINEAR_FLAG_UNROLL_4

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
    1, 1, 28, BILINEAR_FLAG_UNROLL_4