304
305
/******************************************************************************/
307
/* src_x888_0565: convert 32bpp x888 source pixels to 16bpp r5g6b5 dest.
 * NOTE(review): the bare numeric lines interleaved below look like line
 * numbers from an external listing fused into the text, and the .endm
 * closing each macro (plus the .if that the .elseif lines continue) are
 * not visible in this excerpt -- confirm against the original source. */
.macro src_x888_0565_init
308
/* Hold loop invariant in MASK */
309
ldr MASK, =0x001F001F @ two copies of the 5-bit field mask, one per halfword
310
line_saved_regs STRIDE_S, ORIG_W
313
/* Convert one x888 pixel in WK&s to r5g6b5 in WK&d; clobbers STRIDE_S. */
.macro src_x888_0565_1pixel s, d
314
and WK&d, MASK, WK&s, lsr #3 @ 00000000000rrrrr00000000000bbbbb
315
and STRIDE_S, WK&s, #0xFC00 @ 0000000000000000gggggg0000000000
316
orr WK&d, WK&d, WK&d, lsr #5 @ 00000000000-----rrrrr000000bbbbb
317
orr WK&d, WK&d, STRIDE_S, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
318
/* Top 16 bits are discarded during the following STRH */
321
/* Convert two x888 pixels (WK&slo first, WK&shi second) and pack both
 * 565 results into the single word WK&d; clobbers SCRATCH and WK&tmp,
 * and reuses WK&shi as a temporary. */
.macro src_x888_0565_2pixels slo, shi, d, tmp
322
and SCRATCH, WK&shi, #0xFC00 @ 0000000000000000GGGGGG0000000000
323
and WK&tmp, MASK, WK&shi, lsr #3 @ 00000000000RRRRR00000000000BBBBB
324
and WK&shi, MASK, WK&slo, lsr #3 @ 00000000000rrrrr00000000000bbbbb
325
orr WK&tmp, WK&tmp, WK&tmp, lsr #5 @ 00000000000-----RRRRR000000BBBBB
326
orr WK&tmp, WK&tmp, SCRATCH, lsr #5 @ 00000000000-----RRRRRGGGGGGBBBBB
327
and SCRATCH, WK&slo, #0xFC00 @ 0000000000000000gggggg0000000000
328
orr WK&shi, WK&shi, WK&shi, lsr #5 @ 00000000000-----rrrrr000000bbbbb
329
orr WK&shi, WK&shi, SCRATCH, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
330
pkhbt WK&d, WK&shi, WK&tmp, lsl #16 @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
333
/* Load source pixels and start converting. Source is 32bpp, destination
 * 16bpp, hence numbytes*2 bytes of source per numbytes of destination. */
.macro src_x888_0565_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
339
pixld , 16, 4, SRC, 0
340
src_x888_0565_2pixels 4, 5, 0, 0
342
src_x888_0565_2pixels 6, 7, 1, 1
345
pixld , numbytes*2, 4, SRC, 0
349
/* Finish converting the remaining pixels and store the results.
 * NOTE(review): the opening .if (presumably on numbytes == 16) that these
 * .elseif branches continue is not visible in this excerpt. */
.macro src_x888_0565_process_tail cond, numbytes, firstreg
351
src_x888_0565_2pixels 4, 5, 2, 2
352
src_x888_0565_2pixels 6, 7, 3, 4
353
.elseif numbytes == 8
354
src_x888_0565_2pixels 4, 5, 1, 1
355
src_x888_0565_2pixels 6, 7, 2, 2
356
.elseif numbytes == 4
357
src_x888_0565_2pixels 4, 5, 1, 1
359
src_x888_0565_1pixel 4, 1
362
pixst , numbytes, 0, DST
364
pixst , numbytes, 1, DST
372
/* Entry point: pixman_composite_src_x888_0565_asm_armv6.
 * generate_composite_function args: name, src bpp, mask bpp, dst bpp,
 * flags, prefetch distance, then init/newline/cleanup/head/tail macros.
 * Fixed here: stray numeric listing lines had been interleaved with this
 * backslash-continued invocation and would not assemble; removed. */
generate_composite_function \
    pixman_composite_src_x888_0565_asm_armv6, 32, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
    3, /* prefetch distance */ \
    src_x888_0565_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    src_x888_0565_process_head, \
    src_x888_0565_process_tail
382
/******************************************************************************/
306
384
/* Per-byte saturating add of MASK / STRIDE_M into two destination
 * registers, with an optional condition-code suffix.
 * NOTE(review): the listing numbers jump from 386 to 690 right after the
 * second uqadd8, so the remainder of this macro (no .endm is visible) and
 * everything up to the next separator is missing from this excerpt. */
.macro add_8_8_8pixels cond, dst1, dst2
307
385
uqadd8&cond WK&dst1, WK&dst1, MASK
308
386
uqadd8&cond WK&dst2, WK&dst2, STRIDE_M
612
690
/******************************************************************************/
692
/* over_reverse with a solid source: destination is composited OVER the
 * constant source. MASK holds 0x00800080, used both to set GE[3:0] (via
 * the uadd8 below) and as the rounding term of the mla instructions.
 * NOTE(review): interleaved numeric lines are listing artifacts; no .endm
 * directives are visible, and some instructions appear to be missing
 * (nothing visible sets STRIDE_S before its use as the red/blue
 * multiplicand, and several beq have no visible preceding flag-setting
 * instruction) -- confirm against the original source. */
.macro over_reverse_n_8888_init
693
ldr SRC, [sp, #ARGS_STACK_OFFSET] @ fetch the solid source pixel
694
ldr MASK, =0x00800080
695
/* Split source pixel into RB/AG parts */
697
uxtb16 STRIDE_M, SRC, ror #8 @ alpha/green halfwords of the source
698
/* Set GE[3:0] to 0101 so SEL instructions do what we want */
699
uadd8 SCRATCH, MASK, MASK
700
line_saved_regs STRIDE_D, ORIG_W
703
.macro over_reverse_n_8888_newline
707
/* Per-block head: this operation reads the destination, not the source. */
.macro over_reverse_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
708
pixld , numbytes, firstreg, DST, 0
711
/* Blend the solid source under one destination pixel held in WK&d. */
.macro over_reverse_n_8888_1pixel d, is_only
713
beq 8f /* replace with source */
714
bics ORIG_W, STRIDE_D, WK&d, lsr #24
716
beq 49f /* skip store */
718
beq 9f /* write same value back */
720
mla SCRATCH, STRIDE_S, ORIG_W, MASK /* red/blue */
721
mla ORIG_W, STRIDE_M, ORIG_W, MASK /* alpha/green */
722
uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
723
uxtab16 ORIG_W, ORIG_W, ORIG_W, ror #8
724
mov SCRATCH, SCRATCH, ror #8
725
sel ORIG_W, SCRATCH, ORIG_W @ interleave RB/AG bytes using the GE flags
726
uqadd8 WK&d, WK&d, ORIG_W @ per-byte saturating add of the scaled source
732
/* Composite a block of up to four pixels for over_reverse. The AND of the
 * destination words feeds the mvns test: Z is set iff all four alpha
 * bytes are 0xFF, in which case the store is skipped via 49f.
 * Fixed here: "WK®N" was HTML-entity mojibake for "WK&regN" (compare the
 * WK&d / WK&dst1 spellings used elsewhere in this file, and the reg1-reg4
 * parameter names of this macro); stray numeric listing lines removed.
 * NOTE(review): the .if/.elseif structure selecting by numbytes and the
 * closing .endm are not visible in this excerpt. */
.macro over_reverse_n_8888_tail numbytes, reg1, reg2, reg3, reg4
over_reverse_n_8888_1pixel reg1, 1
and SCRATCH, WK&reg1, WK&reg2
and SCRATCH, SCRATCH, WK&reg3
and SCRATCH, SCRATCH, WK&reg4
mvns SCRATCH, SCRATCH, asr #24 @ Z iff the ANDed alpha byte is 0xFF
beq 49f /* skip store if all opaque */
over_reverse_n_8888_1pixel reg1, 0
over_reverse_n_8888_1pixel reg2, 0
over_reverse_n_8888_1pixel reg3, 0
over_reverse_n_8888_1pixel reg4, 0
pixst , numbytes, reg1, DST
754
/* Per-block tail: expand firstreg into four consecutive WK register numbers. */
.macro over_reverse_n_8888_process_tail cond, numbytes, firstreg
755
over_reverse_n_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
758
/* Entry point: pixman_composite_over_reverse_n_8888_asm_armv6.
 * Fixed here: missing comma after the dst-bpp argument (compare the
 * src_x888_0565 invocation earlier in this file); stray numeric listing
 * lines that split the continued invocation removed. */
generate_composite_function \
    pixman_composite_over_reverse_n_8888_asm_armv6, 0, 0, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
    3, /* prefetch distance */ \
    over_reverse_n_8888_init, \
    over_reverse_n_8888_newline, \
    nop_macro, /* cleanup */ \
    over_reverse_n_8888_process_head, \
    over_reverse_n_8888_process_tail
768
/******************************************************************************/
770
/* over with component alpha, specialised for a white solid source, so the
 * mask pixel itself acts as source*mask.
 * NOTE(review): interleaved numeric lines are listing artifacts; the
 * .endm directives, the .if/.else selection on numbytes, and the branches
 * that pair with the 01:/02:/03:/04: labels are not visible in this
 * excerpt -- confirm against the original source. */
.macro over_white_8888_8888_ca_init
777
line_saved_regs STRIDE_D, STRIDE_M, ORIG_W
778
ldr SCRATCH, =0x800080
780
/* Set GE[3:0] to 0101 so SEL instructions do what we want */
781
uadd8 SCRATCH, SCRATCH, SCRATCH
782
.set DST_PRELOAD_BIAS, 8
785
.macro over_white_8888_8888_ca_cleanup
786
.set DST_PRELOAD_BIAS, 0
795
/* Combine destination pixel d with mask pixel m. The *_notmask
 * temporaries and the final saturating "uqadd8 d, d, m" suggest
 * d = m + d*(1-m); TMP0 appears to hold ~m on entry -- confirm against
 * the full source. Clobbers TMP0-TMP3. */
.macro over_white_8888_8888_ca_combine m, d
796
uxtb16 TMP1, TMP0 /* rb_notmask */
797
uxtb16 TMP2, d /* rb_dest; 1 stall follows */
798
smlatt TMP3, TMP2, TMP1, HALF /* red */
799
smlabb TMP2, TMP2, TMP1, HALF /* blue */
800
uxtb16 TMP0, TMP0, ror #8 /* ag_notmask */
801
uxtb16 TMP1, d, ror #8 /* ag_dest; 1 stall follows */
802
smlatt d, TMP1, TMP0, HALF /* alpha */
803
smlabb TMP1, TMP1, TMP0, HALF /* green */
804
pkhbt TMP0, TMP2, TMP3, lsl #16 /* rb; 1 stall follows */
805
pkhbt TMP1, TMP1, d, lsl #16 /* ag */
806
uxtab16 TMP0, TMP0, TMP0, ror #8
807
uxtab16 TMP1, TMP1, TMP1, ror #8
808
mov TMP0, TMP0, ror #8
810
uqadd8 d, d, m /* d is a late result */
813
/* Load one mask pixel into WK1. */
.macro over_white_8888_8888_ca_1pixel_head
814
pixld , 4, 1, MASK, 0
818
/* Finish one pixel: test the mask for the all-0/all-1 shortcuts, then
 * run the general combine and store. */
.macro over_white_8888_8888_ca_1pixel_tail
820
teq WK1, WK1, asr #32 @ Zc if transparent, ZC if opaque (cf. over_n_8888_8888_ca_1pixel_tail)
825
01: over_white_8888_8888_ca_combine WK1, WK3
826
02: pixst , 4, 3, DST
830
/* Load two mask pixels into WK1-WK2. */
.macro over_white_8888_8888_ca_2pixels_head
831
pixld , 8, 1, MASK, 0
834
/* Finish two pixels: per-pixel shortcut tests, then combine and store. */
.macro over_white_8888_8888_ca_2pixels_tail
837
teq WK1, WK1, asr #32
844
01: over_white_8888_8888_ca_combine WK1, WK3
846
teq WK2, WK2, asr #32
850
03: over_white_8888_8888_ca_combine WK2, WK4
851
04: pixst , 8, 3, DST
855
/* Dispatch on numbytes to the 1- and 2-pixel helpers above. */
.macro over_white_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
857
over_white_8888_8888_ca_1pixel_head
860
over_white_8888_8888_ca_2pixels_head
861
over_white_8888_8888_ca_2pixels_tail
863
over_white_8888_8888_ca_2pixels_head
867
.macro over_white_8888_8888_ca_process_tail cond, numbytes, firstreg
869
over_white_8888_8888_ca_1pixel_tail
871
over_white_8888_8888_ca_2pixels_tail
875
/* Entry point: pixman_composite_over_white_8888_8888_ca_asm_armv6.
 * Fixed here: missing commas after the dst-bpp argument and the flags
 * argument (compare the src_x888_0565 invocation earlier in this file);
 * stray numeric listing lines removed. */
generate_composite_function \
    pixman_composite_over_white_8888_8888_ca_asm_armv6, 0, 32, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
    2, /* prefetch distance */ \
    over_white_8888_8888_ca_init, \
    nop_macro, /* newline */ \
    over_white_8888_8888_ca_cleanup, \
    over_white_8888_8888_ca_process_head, \
    over_white_8888_8888_ca_process_tail
886
/* over with component alpha and an arbitrary solid source.
 * NOTE(review): interleaved numeric lines are listing artifacts; the
 * .endm directives, the .if/.else shortcut structure, the stores of the
 * constants to [sp, #LOCALS_STACK_OFFSET], and the branches that pair
 * with the 10:/20:/30: labels are not visible in this excerpt -- confirm
 * against the original source. */
.macro over_n_8888_8888_ca_init
887
/* Set up constants. RB_SRC and AG_SRC are in registers;
888
* RB_FLDS, A_SRC, and the two HALF values need to go on the
889
* stack (and the full SRC value is already there) */
890
ldr SCRATCH, [sp, #ARGS_STACK_OFFSET]
892
orr WK0, WK0, #0xFF /* RB_FLDS (0x00FF00FF) */
893
mov WK1, #0x80 /* HALF default value */
894
mov WK2, SCRATCH, lsr #24 /* A_SRC */
895
orr WK3, WK1, WK1, lsl #16 /* HALF alternate value (0x00800080) */
897
.set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+16
899
uxtb16 STRIDE_S, SCRATCH, ror #8 @ presumably AG_SRC (see comment above) -- confirm
901
/* Set GE[3:0] to 0101 so SEL instructions do what we want */
902
uadd8 SCRATCH, WK3, WK3
913
RB_FLDS .req r8 /* the reloaded constants have to be at consecutive registers starting at an even one */
921
line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
924
.macro over_n_8888_8888_ca_cleanup
926
.set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-16
946
/* Load one mask pixel into WK6. */
.macro over_n_8888_8888_ca_1pixel_head
947
pixld , 4, 6, MASK, 0
951
/* Finish one pixel: shortcut the transparent / opaque mask cases, else
 * do the full source*mask and dest*(1-srcalpha*mask) computation. */
.macro over_n_8888_8888_ca_1pixel_tail
952
ldrd A_SRC, HALF, [sp, #LOCALS_STACK_OFFSET+8]
953
uxtb16 WK1, WK6 /* rb_mask (first step of hard case placed in what would otherwise be a stall) */
954
teq WK6, WK6, asr #32 /* Zc if transparent, ZC if opaque */
957
/* Mask is fully opaque (all channels) */
958
ldr WK6, [sp, #ARGS_STACK_OFFSET] /* get SRC back */
959
eors A_SRC, A_SRC, #0xFF
961
/* Source is also opaque - same as src_8888_8888 */
964
10: /* Same as over_8888_8888 */
965
mul_8888_8 WK0, A_SRC, WK5, HALF
968
20: /* No simplifications possible - do it the hard way */
969
uxtb16 WK2, WK6, ror #8 /* ag_mask */
970
mla WK3, WK1, A_SRC, HALF /* rb_mul; 2 cycles */
971
mla WK4, WK2, A_SRC, HALF /* ag_mul; 2 cycles */
972
ldrd RB_FLDS, HALF, [sp, #LOCALS_STACK_OFFSET]
973
uxtb16 WK5, WK0 /* rb_dest */
974
uxtab16 WK3, WK3, WK3, ror #8
975
uxtb16 WK6, WK0, ror #8 /* ag_dest */
976
uxtab16 WK4, WK4, WK4, ror #8
977
smlatt WK0, RB_SRC, WK1, HALF /* red1 */
978
smlabb WK1, RB_SRC, WK1, HALF /* blue1 */
979
bic WK3, RB_FLDS, WK3, lsr #8
980
bic WK4, RB_FLDS, WK4, lsr #8
981
pkhbt WK1, WK1, WK0, lsl #16 /* rb1 */
982
smlatt WK0, WK5, WK3, HALF /* red2 */
983
smlabb WK3, WK5, WK3, HALF /* blue2 */
984
uxtab16 WK1, WK1, WK1, ror #8
985
smlatt WK5, AG_SRC, WK2, HALF /* alpha1 */
986
pkhbt WK3, WK3, WK0, lsl #16 /* rb2 */
987
smlabb WK0, AG_SRC, WK2, HALF /* green1 */
988
smlatt WK2, WK6, WK4, HALF /* alpha2 */
989
smlabb WK4, WK6, WK4, HALF /* green2 */
990
pkhbt WK0, WK0, WK5, lsl #16 /* ag1 */
991
uxtab16 WK3, WK3, WK3, ror #8
992
pkhbt WK4, WK4, WK2, lsl #16 /* ag2 */
993
uxtab16 WK0, WK0, WK0, ror #8
994
uxtab16 WK4, WK4, WK4, ror #8
997
sel WK2, WK1, WK0 /* recombine source*mask */
998
sel WK1, WK3, WK4 /* recombine dest*(1-source_alpha*mask) */
999
uqadd8 WK0, WK1, WK2 /* followed by 1 stall */
1000
30: /* The destination buffer is already in the L1 cache, so
1001
* there's little point in amalgamating writes */
1006
/* Process a block one pixel at a time using the helpers above. */
.macro over_n_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
1007
.rept (numbytes / 4) - 1
1008
over_n_8888_8888_ca_1pixel_head
1009
over_n_8888_8888_ca_1pixel_tail
1011
over_n_8888_8888_ca_1pixel_head
1014
.macro over_n_8888_8888_ca_process_tail cond, numbytes, firstreg
1015
over_n_8888_8888_ca_1pixel_tail
1018
/* C-callable entry point for over_n_8888_8888_ca: branch to the
 * specialised white-source implementation when the test below takes, else
 * drop through into the general helper generated further down.
 * NOTE(review): the instructions that load the source and set the flags
 * tested by this beq, and the function epilogue, are not visible in this
 * excerpt. */
pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6
1021
beq pixman_composite_over_white_8888_8888_ca_asm_armv6
1022
/* else drop through... */
1024
/* General (non-white) helper for over_n_8888_8888_ca, reached by falling
 * through from the dispatcher above.
 * Fixed here: missing commas after the dst-bpp argument and the flags
 * argument (compare the src_x888_0565 invocation earlier in this file);
 * stray numeric listing lines removed. */
generate_composite_function \
    pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0, \
    2, /* prefetch distance */ \
    over_n_8888_8888_ca_init, \
    nop_macro, /* newline */ \
    over_n_8888_8888_ca_cleanup, \
    over_n_8888_8888_ca_process_head, \
    over_n_8888_8888_ca_process_tail
1034
/******************************************************************************/
1036
/* in_reverse: scale each destination pixel by the corresponding source
 * pixel's alpha. MASK holds the 0x00800080 rounding constant used by the
 * mla in in_reverse_8888_8888_1pixel. */
.macro in_reverse_8888_8888_init
1037
/* Hold loop invariant in MASK */
1038
ldr MASK, =0x00800080
1039
/* Set GE[3:0] to 0101 so SEL instructions do what we want */
1040
uadd8 SCRATCH, MASK, MASK
1041
/* Offset the source pointer: we only need the alpha bytes */
1043
/* NOTE(review): no instruction adjusting SRC is visible after the
 * comment above -- the add that applies the offset appears to be missing
 * from this excerpt. */
line_saved_regs ORIG_W
1046
/* Load the alpha byte of each of up to four source pixels (SRC walks the
 * 32bpp source in 4-byte strides) and advance DST past the block so the
 * tail can store with negative offsets.
 * Fixed here: "WK®N" was HTML-entity mojibake for "WK&regN" (the macro's
 * parameters are reg1-reg3; compare WK&d / WK&dst1 elsewhere in this
 * file); stray numeric listing lines removed.
 * NOTE(review): the conditional structure selecting how many ldrb to run
 * for a given numbytes, and the closing .endm, are not visible in this
 * excerpt. */
.macro in_reverse_8888_8888_head numbytes, reg1, reg2, reg3
ldrb ORIG_W, [SRC], #4
ldrb WK&reg1, [SRC], #4
ldrb WK&reg2, [SRC], #4
ldrb WK&reg3, [SRC], #4
add DST, DST, #numbytes
1058
/* Per-block head: read the source alpha bytes for this block. */
.macro in_reverse_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
1059
in_reverse_8888_8888_head numbytes, firstreg, %(firstreg+1), %(firstreg+2)
1062
/* Scale destination pixel d by source alpha byte s, with 0x80 rounding
 * supplied by MASK.
 * NOTE(review): only the rb half of the product is visibly completed (in
 * SCRATCH); the matching mla for the ag half, the sel recombination, the
 * branches, and the "mov d, #0" lines that the 48: label's comment refers
 * to are not visible in this excerpt. */
.macro in_reverse_8888_8888_1pixel s, d, offset, is_only
1066
ldrb ORIG_W, [SRC, #offset] @ loads a source alpha byte for a later pixel -- confirm role against full source
1072
uxtb16 SCRATCH, d /* rb_dest */
1073
uxtb16 d, d, ror #8 /* ag_dest */
1074
mla SCRATCH, SCRATCH, s, MASK
1076
uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
1077
uxtab16 d, d, d, ror #8
1078
mov SCRATCH, SCRATCH, ror #8
1082
48: /* Last mov d,#0 of the set - used as part of shortcut for
1083
* source values all 0 */
1089
/* Tail for in_reverse: combine up to four pixels, with shortcuts when the
 * source alpha bytes are all 0 (set dest to 0 via 48f) or all 0xFF (skip
 * the store entirely via 49f); destination words are only loaded (ldrne/
 * ldmnedb) when neither shortcut applies.
 * Fixed here: "WK®N" was HTML-entity mojibake for "WK&regN" (the macro's
 * parameters are reg1-reg4); stray numeric listing lines removed.
 * NOTE(review): the .if lines that these .elseif branches continue, and
 * the closing directives, are not visible in this excerpt. */
.macro in_reverse_8888_8888_tail numbytes, reg1, reg2, reg3, reg4
teq ORIG_W, ORIG_W, asr #32
ldrne WK&reg1, [DST, #-4]
.elseif numbytes == 8
teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */
ldmnedb DST, {WK&reg1-WK&reg2}
teqeq ORIG_W, WK&reg2
teqeq ORIG_W, WK&reg3
teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */
ldmnedb DST, {WK&reg1-WK&reg4}
cmnne DST, #0 /* clear C if NE */
bcs 49f /* no writes to dest if source all -1 */
beq 48f /* set dest to all 0 if source all 0 */
in_reverse_8888_8888_1pixel ORIG_W, WK&reg1, 0, 1
str WK&reg1, [DST, #-4]
.elseif numbytes == 8
in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -4, 0
in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, 0, 0
stmdb DST, {WK&reg1-WK&reg2}
in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -12, 0
in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, -8, 0
in_reverse_8888_8888_1pixel STRIDE_M, WK&reg3, -4, 0
in_reverse_8888_8888_1pixel STRIDE_M, WK&reg4, 0, 0
stmdb DST, {WK&reg1-WK&reg4}
1124
/* Per-block tail: expand firstreg into four consecutive WK register numbers. */
.macro in_reverse_8888_8888_process_tail cond, numbytes, firstreg
1125
in_reverse_8888_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
1128
/* Entry point: pixman_composite_in_reverse_8888_8888_asm_armv6.
 * Fixed here: missing commas after the dst-bpp argument and the flags
 * argument (compare the src_x888_0565 invocation earlier in this file);
 * stray numeric listing lines removed. */
generate_composite_function \
    pixman_composite_in_reverse_8888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_NO_PRELOAD_DST, \
    2, /* prefetch distance */ \
    in_reverse_8888_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    in_reverse_8888_8888_process_head, \
    in_reverse_8888_8888_process_tail
1138
/******************************************************************************/