@ ARMv4L optimized DSP utils
@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
@
@ This file is part of FFmpeg.
@
@ FFmpeg is free software; you can redistribute it and/or
@ modify it under the terms of the GNU Lesser General Public
@ License as published by the Free Software Foundation; either
@ version 2.1 of the License, or (at your option) any later version.
@
@ FFmpeg is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
@ Lesser General Public License for more details.
@
@ You should have received a copy of the GNU Lesser General Public
@ License along with FFmpeg; if not, write to the Free Software
@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
33
@ void ff_prefetch_arm(const uint8_t *mem, int stride, int h) — TODO confirm
@ signature against the full file; only the function header survives here.
@ NOTE(review): the body of ff_prefetch_arm is missing from this extracted
@ chunk.  Throughout this file, bare numeric lines are extraction artifacts
@ (original source line numbers), not assembly.
function ff_prefetch_arm, export=1
42
@ ADJ_ALIGN_QUADWORD_D: build four word-aligned result words Rd0..Rd3 from
@ five consecutive source words Rn0..Rn4 whose payload starts \shift bytes
@ (1..3) into Rn0:
@     Rd[i] = (Rn[i] >> shift*8) | (Rn[i+1] << (32 - shift*8))
@ (little-endian byte order, per the ARMv4L target named in the header).
@ The Rd registers must be distinct from Rn1..Rn4: all four mov's execute
@ before the orr's read the source words again.
@ NOTE(review): the closing .endm is missing from this extracted chunk
@ (original line numbers jump); bare numeric lines are extraction artifacts.
.macro ADJ_ALIGN_QUADWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
43
mov \Rd0, \Rn0, lsr #(\shift * 8) @ low part of each output word
44
mov \Rd1, \Rn1, lsr #(\shift * 8)
45
mov \Rd2, \Rn2, lsr #(\shift * 8)
46
mov \Rd3, \Rn3, lsr #(\shift * 8)
47
orr \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8) @ fill high bytes from next word
48
orr \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8)
49
orr \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8)
50
orr \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8)
52
@ ADJ_ALIGN_DOUBLEWORD: in-place two-word realign.  {R0,R1} are rewritten
@ as the 8 bytes starting \shift bytes (1..3) into the original {R0,R1},
@ pulling the tail bytes in from R2:
@     R0 = (R0 >> shift*8) | (R1 << (32 - shift*8))
@     R1 = (R1 >> shift*8) | (R2 << (32 - shift*8))
@ NOTE(review): the closing .endm is missing from this extracted chunk;
@ bare numeric lines are extraction artifacts.
.macro ADJ_ALIGN_DOUBLEWORD shift, R0, R1, R2
53
mov \R0, \R0, lsr #(\shift * 8)
54
orr \R0, \R0, \R1, lsl #(32 - \shift * 8)
55
mov \R1, \R1, lsr #(\shift * 8)
56
orr \R1, \R1, \R2, lsl #(32 - \shift * 8)
58
@ ADJ_ALIGN_DOUBLEWORD_D: non-destructive variant of ADJ_ALIGN_DOUBLEWORD:
@ writes the realigned 8 bytes into {Rdst0,Rdst1} and leaves the sources
@ readable.  Destinations must not alias Rsrc1/Rsrc2 (both mov's execute
@ before the orr's re-read those sources); aliasing Rsrc0 is safe.
@ NOTE(review): the closing .endm is missing from this extracted chunk;
@ bare numeric lines are extraction artifacts.
.macro ADJ_ALIGN_DOUBLEWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
59
mov \Rdst0, \Rsrc0, lsr #(\shift * 8)
60
mov \Rdst1, \Rsrc1, lsr #(\shift * 8)
61
orr \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8))
62
orr \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8))
65
@ RND_AVG32: bytewise *rounding* average of two 8-byte rows held in two
@ register pairs, four pixels per 32-bit word, using the carry-free
@ identity in the comment below.  \Rmask is expected to hold ~0x01010101
@ (it removes the per-byte low bits so the >>1 cannot borrow across bytes)
@ — TODO confirm against the prologue that loads it, not visible here.
@ NOTE(review): the leading lines that should place Rn^Rm into Rd and
@ Rn|Rm into Rn, plus the closing .endm, are missing from this extracted
@ chunk (original line numbers jump 66 -> 73); the surviving and/sub lines
@ match the formula on that assumption — confirm against the full file.
.macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
66
@ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
73
and \Rd0, \Rd0, \Rmask @ mask per-byte low bits before shifting
74
and \Rd1, \Rd1, \Rmask
75
sub \Rd0, \Rn0, \Rd0, lsr #1 @ (a|b) - (((a^b) & mask) >> 1)
76
sub \Rd1, \Rn1, \Rd1, lsr #1
79
@ NO_RND_AVG32: bytewise *truncating* (no-rounding) average, companion to
@ RND_AVG32.  \Rmask is expected to hold ~0x01010101 — TODO confirm.
@ NOTE(review): the leading setup lines (Rn &= Rm / Rd = Rn ^ Rm,
@ presumably) and the closing .endm are missing from this extracted chunk;
@ bare numeric lines are extraction artifacts.
.macro NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
80
@ Rd = (Rn & Rm) + (((Rn ^ Rm) & ~0x01010101) >> 1)
@ (comment sign corrected by review: the adds below implement '+', the
@ standard floor-average identity; the old comment showed '-')
87
and \Rd0, \Rd0, \Rmask @ mask per-byte low bits before shifting
88
and \Rd1, \Rd1, \Rmask
89
add \Rd0, \Rn0, \Rd0, lsr #1 @ (a&b) + (((a^b) & mask) >> 1)
90
add \Rd1, \Rn1, \Rd1, lsr #1
93
@ ----------------------------------------------------------------
95
@ put_pixels16_arm: copy a 16-pixel-wide, h-row block from an unaligned
@ source to a word-aligned destination (no interpolation).  Four code
@ paths by source byte alignment: the aligned path copies words directly;
@ the 1/2/3 paths realign five source words per row with
@ ADJ_ALIGN_QUADWORD_D before storing four.
@ NOTE(review): most of the body is missing from this extracted chunk
@ (per-row ldm/stm, loop branches, the alignment dispatch); original line
@ numbers jump, and bare numeric lines are extraction artifacts, not code.
function put_pixels16_arm, export=1
96
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
97
@ block = word aligned, pixels = unaligned
99
stmfd sp!, {r4-r11, lr} @ R14 is also called LR
103
@ presumably computes a jump-table branch target from the alignment in
@ r4 — TODO confirm against the full file
add r5, r5, r4, lsl #2
113
ldmfd sp!, {r4-r11, pc} @ aligned case: restore regs and return
118
@ 1-byte-alignment case: shift the five loaded words right by 1 byte
ADJ_ALIGN_QUADWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8
124
ldmfd sp!, {r4-r11, pc}
129
@ 2-byte-alignment case
ADJ_ALIGN_QUADWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8
135
ldmfd sp!, {r4-r11, pc}
140
@ 3-byte-alignment case
ADJ_ALIGN_QUADWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8
146
ldmfd sp!, {r4-r11,pc}
155
@ ----------------------------------------------------------------
157
@ put_pixels8_arm: copy an 8-pixel-wide, h-row block from an unaligned
@ source to a word-aligned destination (no interpolation).  Each row loads
@ three consecutive words {r4, r5, r12}; for byte alignments 1..3 the
@ 8 payload bytes are realigned with ADJ_ALIGN_DOUBLEWORD before the
@ store (stores not visible in this chunk).
@ NOTE(review): the alignment dispatch, stores and loop branches are
@ missing from this extracted chunk; bare numeric lines are extraction
@ artifacts (original line numbers), not code.
function put_pixels8_arm, export=1
158
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
159
@ block = word aligned, pixels = unaligned
161
stmfd sp!, {r4-r5,lr} @ R14 is also called LR
165
@ presumably jump-table dispatch on alignment — TODO confirm
add r5, r5, r4, lsl #2
175
ldmfd sp!, {r4-r5,pc} @ aligned case: restore and return
178
ldmia r1, {r4-r5, r12} @ load 12 bytes covering the row
180
ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r12 @ realign by 1 byte
186
ldmfd sp!, {r4-r5,pc}
189
ldmia r1, {r4-r5, r12}
191
ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r12 @ realign by 2 bytes
197
ldmfd sp!, {r4-r5,pc}
200
ldmia r1, {r4-r5, r12}
202
ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r12 @ realign by 3 bytes
208
ldmfd sp!, {r4-r5,pc}
217
@ ----------------------------------------------------------------
219
@ put_pixels8_x2_arm: 8xh copy with horizontal (x) half-pel interpolation,
@ rounding variant: each output byte is the rounded average of src[x] and
@ src[x+1], computed by RND_AVG32 on two byte-shifted views of the same
@ 12 loaded bytes.  r12 is expected to hold the ~0x01010101 mask for
@ RND_AVG32 (loaded in the prologue, not visible here) — TODO confirm.
@ NOTE(review): stores, loop branches and the alignment dispatch are
@ missing from this extracted chunk; bare numeric lines are extraction
@ artifacts (original line numbers), not code.
function put_pixels8_x2_arm, export=1
220
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
221
@ block = word aligned, pixels = unaligned
223
stmfd sp!, {r4-r10,lr} @ R14 is also called LR
227
@ presumably jump-table dispatch on alignment — TODO confirm
add r5, r5, r4, lsl #2
231
ldmia r1, {r4-r5, r10} @ alignment 0: load 12 bytes
233
ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10 @ view at byte offset 1
235
RND_AVG32 r8, r9, r4, r5, r6, r7, r12 @ avg(offset 0, offset 1)
240
ldmfd sp!, {r4-r10,pc}
243
ldmia r1, {r4-r5, r10} @ alignment 1
245
ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10 @ view at offset 1
246
ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10 @ view at offset 2
248
RND_AVG32 r4, r5, r6, r7, r8, r9, r12
253
ldmfd sp!, {r4-r10,pc}
256
ldmia r1, {r4-r5, r10} @ alignment 2
258
ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10 @ view at offset 2
259
ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10 @ view at offset 3
261
RND_AVG32 r4, r5, r6, r7, r8, r9, r12
266
ldmfd sp!, {r4-r10,pc}
269
ldmia r1, {r4-r5, r10} @ alignment 3
271
ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10 @ view at offset 3
273
RND_AVG32 r8, r9, r6, r7, r5, r10, r12 @ offset-4 view is just {r5,r10}
278
ldmfd sp!, {r4-r10,pc} @@ update PC with LR content.
288
@ put_no_rnd_pixels8_x2_arm: truncating (no-rounding) twin of
@ put_pixels8_x2_arm — identical structure, but averages with
@ NO_RND_AVG32 instead of RND_AVG32.  r12 is expected to hold the
@ ~0x01010101 mask (prologue not visible here) — TODO confirm.
@ NOTE(review): stores, loop branches and the alignment dispatch are
@ missing from this extracted chunk; bare numeric lines are extraction
@ artifacts (original line numbers), not code.
function put_no_rnd_pixels8_x2_arm, export=1
289
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
290
@ block = word aligned, pixels = unaligned
292
stmfd sp!, {r4-r10,lr} @ R14 is also called LR
296
@ presumably jump-table dispatch on alignment — TODO confirm
add r5, r5, r4, lsl #2
300
ldmia r1, {r4-r5, r10} @ alignment 0: load 12 bytes
302
ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10 @ view at byte offset 1
304
NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12 @ avg(offset 0, offset 1)
309
ldmfd sp!, {r4-r10,pc}
312
ldmia r1, {r4-r5, r10} @ alignment 1
314
ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10 @ view at offset 1
315
ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10 @ view at offset 2
317
NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
322
ldmfd sp!, {r4-r10,pc}
325
ldmia r1, {r4-r5, r10} @ alignment 2
327
ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10 @ view at offset 2
328
ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10 @ view at offset 3
330
NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
335
ldmfd sp!, {r4-r10,pc}
338
ldmia r1, {r4-r5, r10} @ alignment 3
340
ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10 @ view at offset 3
342
NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12 @ offset-4 view is just {r5,r10}
347
ldmfd sp!, {r4-r10,pc} @@ update PC with LR content.
357
@ ----------------------------------------------------------------
359
@ put_pixels8_y2_arm: 8xh copy with vertical (y) half-pel interpolation,
@ rounding variant: each output row is the bytewise rounded average of
@ source rows n and n+1 (RND_AVG32).  The loop is unrolled 2x,
@ ping-ponging two row buffers ({r4,r5} vs {r6,r7} aligned, {r4,r5} vs
@ {r7,r8} in the realigned paths) so every loaded row is reused as the
@ "previous" row of the next step.  r12 is expected to hold the
@ ~0x01010101 mask (prologue not visible here) — TODO confirm.
@ NOTE(review): row loads, stores, loop branches and the alignment
@ dispatch are missing from this extracted chunk; bare numeric lines are
@ extraction artifacts (original line numbers), not code.
function put_pixels8_y2_arm, export=1
360
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
361
@ block = word aligned, pixels = unaligned
363
stmfd sp!, {r4-r11,lr} @ R14 is also called LR
368
@ presumably jump-table dispatch on alignment — TODO confirm
add r5, r5, r4, lsl #2
377
RND_AVG32 r8, r9, r4, r5, r6, r7, r12 @ aligned: avg(row n, row n+1)
383
RND_AVG32 r8, r9, r6, r7, r4, r5, r12 @ aligned: row roles swapped
388
ldmfd sp!, {r4-r11,pc}
394
ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6 @ realign one row by 1 byte
398
ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9 @ realign the other row by 1 byte
399
RND_AVG32 r10, r11, r4, r5, r7, r8, r12
405
ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
407
RND_AVG32 r10, r11, r7, r8, r4, r5, r12 @ roles swapped
411
ldmfd sp!, {r4-r11,pc}
417
ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6 @ 2-byte-alignment case
421
ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9
422
RND_AVG32 r10, r11, r4, r5, r7, r8, r12
428
ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
430
RND_AVG32 r10, r11, r7, r8, r4, r5, r12
434
ldmfd sp!, {r4-r11,pc}
440
ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6 @ 3-byte-alignment case
444
ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9
445
RND_AVG32 r10, r11, r4, r5, r7, r8, r12
451
ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
453
RND_AVG32 r10, r11, r7, r8, r4, r5, r12
457
ldmfd sp!, {r4-r11,pc}
468
@ put_no_rnd_pixels8_y2_arm: truncating (no-rounding) twin of
@ put_pixels8_y2_arm — same 2x-unrolled row ping-pong, but averages with
@ NO_RND_AVG32.  r12 is expected to hold the ~0x01010101 mask (prologue
@ not visible here) — TODO confirm.
@ NOTE(review): row loads, stores, loop branches and the alignment
@ dispatch are missing from this extracted chunk; bare numeric lines are
@ extraction artifacts (original line numbers), not code.
function put_no_rnd_pixels8_y2_arm, export=1
469
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
470
@ block = word aligned, pixels = unaligned
472
stmfd sp!, {r4-r11,lr} @ R14 is also called LR
477
@ presumably jump-table dispatch on alignment — TODO confirm
add r5, r5, r4, lsl #2
486
NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12 @ aligned: avg(row n, row n+1)
492
NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12 @ aligned: row roles swapped
497
ldmfd sp!, {r4-r11,pc}
503
ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6 @ 1-byte-alignment case
507
ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9
508
NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
514
ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
516
NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
520
ldmfd sp!, {r4-r11,pc}
526
ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6 @ 2-byte-alignment case
530
ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9
531
NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
537
ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
539
NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
543
ldmfd sp!, {r4-r11,pc}
549
ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6 @ 3-byte-alignment case
553
ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9
554
NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
560
ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
562
NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
566
ldmfd sp!, {r4-r11,pc}
575
@ ----------------------------------------------------------------
576
@ RND_XY2_IT: one step of the 2D (x and y) half-pel averaging kernel.
@ Strategy (per the l1/h1 comments below): split each word into the low
@ 2 bits and the high 6 bits of every byte so four pixels can be summed
@ in parallel without inter-byte carries.  Constants come from a literal
@ pool addressed by r12.
@ NOTE(review): the \align-conditional assembly (.if/.endif), the row
@ loads, and the l1/h1 accumulation arithmetic are missing from this
@ extracted chunk — only the realignment calls and constant loads
@ survive; bare numeric lines are extraction artifacts.
.macro RND_XY2_IT align
577
@ l1= (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202)
578
@ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2)
589
@ the pairs below build the byte-shifted views a (offset \align) and
@ b (offset \align + 1) for the different alignment cases; the selecting
@ .if directives are not visible in this chunk — confirm in the full file
ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r6, r7, r8
591
ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r8, r9, r10
592
ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r8, r9, r10
594
ADJ_ALIGN_DOUBLEWORD_D 2, r4, r5, r8, r9, r10
595
ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r8, r9, r10
597
ADJ_ALIGN_DOUBLEWORD_D 3, r4, r5, r5, r6, r7
599
ldr r14, [r12, #0] @ 0x03030303
605
@ rounding bias: which constant is used presumably depends on the
@ rnd/no_rnd variant; the test setting EQ is not visible here — confirm
ldreq r14, [r12, #16] @ 0x02020202/0x01010101
610
ldr r14, [r12, #20] @ 0xFCFCFCFC >> 2
611
and r4, r14, r4, lsr #2 @ h1 part: keep the high 6 bits of each byte
612
and r5, r14, r5, lsr #2
613
and r6, r14, r6, lsr #2
614
and r7, r14, r7, lsr #2
620
@ RND_XY2_EXPAND: outer body presumably expanded by the *_pixels8_xy2
@ functions below (name and placement suggest it; the expansion site is
@ not visible in this chunk — confirm).  Saves extra scratch registers,
@ runs RND_XY2_IT steps (missing here) and finishes by shifting the
@ accumulated sums down 2 bits and masking each byte with 0x0F0F0F0F.
@ NOTE(review): most of the macro body, and the closing .endm, are
@ missing from this extracted chunk; bare numeric lines are extraction
@ artifacts.
.macro RND_XY2_EXPAND align
622
6: stmfd sp!, {r8-r11} @ local label 6; save additional work registers
629
ldr r14, [r12, #24] @ 0x0F0F0F0F
630
and r4, r14, r4, lsr #2 @ final >>2 of the accumulated sums, per byte
631
and r5, r14, r5, lsr #2
637
ldmfd sp!, {r4-r11,pc} @ restore and return
641
@ put_pixels8_xy2_arm: 8xh copy with 2D (half-x plus half-y) half-pel
@ interpolation, rounding variant.
@ NOTE(review): the body (presumably a RND_XY2_EXPAND expansion) and most
@ of the literal pool are missing from this extracted chunk; bare numeric
@ lines are extraction artifacts (original line numbers), not code.
function put_pixels8_xy2_arm, export=1
642
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
643
@ block = word aligned, pixels = unaligned
645
stmfd sp!, {r4-r11,lr} @ R14 is also called LR
648
@ r12 presumably points at the local literal pool; this computes the
@ alignment-dispatch target from r4 — TODO confirm against the full file
add r5, r12, r4, lsl #2
672
.word 0xFCFCFCFC >> 2 @ literal-pool entry: pre-shifted high-6-bits mask
677
@ put_no_rnd_pixels8_xy2_arm: truncating (no-rounding) twin of
@ put_pixels8_xy2_arm.
@ NOTE(review): the body (presumably a RND_XY2_EXPAND expansion) and most
@ of the literal pool are missing from this extracted chunk; bare numeric
@ lines are extraction artifacts (original line numbers), not code.
function put_no_rnd_pixels8_xy2_arm, export=1
678
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
679
@ block = word aligned, pixels = unaligned
681
stmfd sp!, {r4-r11,lr} @ R14 is also called LR
684
@ r12 presumably points at the local literal pool; this computes the
@ alignment-dispatch target from r4 — TODO confirm against the full file
add r5, r12, r4, lsl #2
708
.word 0xFCFCFCFC >> 2 @ literal-pool entry: pre-shifted high-6-bits mask
712
@ void ff_add_pixels_clamped_ARM(int16_t *block, uint8_t *dest, int stride)
713
@ Adds the 16-bit residuals in block[] to the bytes of dest[], clamping
@ each sum to 0..255.  Eight pixels (one row) are processed per pass:
@ dest bytes are extracted from the words in r4, summed with ldrsh-loaded
@ block samples, and the clamped results are packed byte-by-byte into r9
@ for a (not visible) word store.  r0 advances 16 bytes (8 coefficients)
@ per row; r10 counts remaining rows.
@ NOTE(review): this extracted chunk is missing many lines — several
@ ldrsh/and extraction steps, the tst instructions that set the NE
@ condition consumed by the clamp movs, the stores, and the function's
@ tail beyond the last visible line.  The conditional movs substitute a
@ saturated value when the missing test detects an out-of-range sum;
@ confirm the exact saturation derivation against the full source.  Bare
@ numeric lines are extraction artifacts (original line numbers).
function ff_add_pixels_clamped_ARM, export=1
717
ldr r4, [r1] /* load dest */
718
/* block[0] and block[1]*/
724
add r8, r7, r8, lsr #8 @ block sample (r7) + extracted dest byte
728
movne r6, r5, lsr #24 @ clamp (selecting tst not visible in chunk)
730
movne r8, r7, lsr #24 @ clamp
732
ldrsh r5, [r0, #4] /* moved from [A] */
733
orr r9, r9, r8, lsl #8 @ pack result byte 1
734
/* block[2] and block[3] */
737
and r6, r4, #0xFF0000 @ extract dest byte 2
738
and r8, r4, #0xFF000000 @ extract dest byte 3
739
add r6, r5, r6, lsr #16 @ block[2] + dest byte 2
740
add r8, r7, r8, lsr #24 @ block[3] + dest byte 3
744
movne r6, r5, lsr #24 @ clamp
746
movne r8, r7, lsr #24 @ clamp
747
orr r9, r9, r6, lsl #16 @ pack result byte 2
748
ldr r4, [r1, #4] /* moved from [B] */
749
orr r9, r9, r8, lsl #24 @ pack result byte 3
751
ldrsh r5, [r0, #8] /* moved from [C] */
756
/* block[4] and block[5] */
762
add r8, r7, r8, lsr #8 @ block sample + extracted dest byte
766
movne r6, r5, lsr #24 @ clamp
768
movne r8, r7, lsr #24 @ clamp
770
ldrsh r5, [r0, #12] /* moved from [D] */
771
orr r9, r9, r8, lsl #8 @ pack result byte 5
772
/* block[6] and block[7] */
775
and r6, r4, #0xFF0000 @ extract dest byte 6
776
and r8, r4, #0xFF000000 @ extract dest byte 7
777
add r6, r5, r6, lsr #16
778
add r8, r7, r8, lsr #24
782
movne r6, r5, lsr #24 @ clamp
784
movne r8, r7, lsr #24 @ clamp
785
orr r9, r9, r6, lsl #16 @ pack result byte 6
786
add r0, r0, #16 /* moved from [E] */ @ advance block to the next row
787
orr r9, r9, r8, lsl #24 @ pack result byte 7
788
subs r10, r10, #1 /* moved from [F] */ @ row counter; flags feed loop branch