2
* VP8 NEON optimisations
4
* Copyright (c) 2010 Rob Clark <rob@ti.com>
5
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
7
* This file is part of Libav.
9
* Libav is free software; you can redistribute it and/or
10
* modify it under the terms of the GNU Lesser General Public
11
* License as published by the Free Software Foundation; either
12
* version 2.1 of the License, or (at your option) any later version.
14
* Libav is distributed in the hope that it will be useful,
15
* but WITHOUT ANY WARRANTY; without even the implied warranty of
16
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17
* Lesser General Public License for more details.
19
* You should have received a copy of the GNU Lesser General Public
20
* License along with Libav; if not, write to the Free Software
21
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26
/*
 * ff_vp8_luma_dc_wht_neon (exported)
 *
 * Visible behaviour: loads 16 halfwords (q0-q1) from the 128-bit aligned
 * buffer at r1, overwrites those same 32 bytes with q15 (the first store
 * post-increments r1 by 16), then scatters the 16 halfword lanes of
 * d0-d3 to r0, advancing r0 by r3 after every store. The lane order is
 * d0[n], d1[n], d2[n], d3[n] for n = 0..3.
 * NOTE(review): the transform arithmetic, the initialisation of q15
 * (presumably zero, clearing the input block) and the setup of r3 are not
 * visible in this excerpt -- confirm against the full function.
 */
function ff_vp8_luma_dc_wht_neon, export=1
27
vld1.16 {q0-q1}, [r1,:128]
32
vst1.16 {q15}, [r1,:128]!
35
vst1.16 {q15}, [r1,:128]
59
vst1.16 {d0[0]}, [r0,:16], r3
60
vst1.16 {d1[0]}, [r0,:16], r3
61
vst1.16 {d2[0]}, [r0,:16], r3
62
vst1.16 {d3[0]}, [r0,:16], r3
63
vst1.16 {d0[1]}, [r0,:16], r3
64
vst1.16 {d1[1]}, [r0,:16], r3
65
vst1.16 {d2[1]}, [r0,:16], r3
66
vst1.16 {d3[1]}, [r0,:16], r3
67
vst1.16 {d0[2]}, [r0,:16], r3
68
vst1.16 {d1[2]}, [r0,:16], r3
69
vst1.16 {d2[2]}, [r0,:16], r3
70
vst1.16 {d3[2]}, [r0,:16], r3
71
vst1.16 {d0[3]}, [r0,:16], r3
72
vst1.16 {d1[3]}, [r0,:16], r3
73
vst1.16 {d2[3]}, [r0,:16], r3
74
vst1.16 {d3[3]}, [r0,:16], r3
79
function ff_vp8_luma_dc_wht_dc_neon, export=1
91
/*
 * ff_vp8_idct_add_neon (exported)
 *
 * Visible structure: 16 halfword coefficients are loaded from r1, then
 * two similar passes run. Each pass scales two d registers by the
 * constants in d4[0] (vmull + vshrn #16, then the input is added back)
 * and d4[1] (vqdmulh), and combines the products into d18/d19.
 * The first pass works on d1/d3, the second on d1/d2.
 * During the second pass the 32-byte block at r1 is overwritten with q15,
 * four 32-bit words are loaded from r0 (stride r2, replicated to all
 * lanes), r0 is rewound by 4*r2, and four 32-bit lanes of d0/d1 are
 * stored back in the order d0[0], d0[1], d1[1], d1[0].
 * NOTE(review): the setup of d4 and q15, the transposes and the final
 * add/narrow are not visible in this excerpt -- confirm against the full
 * function. r0/r2 are presumably the destination pointer and its stride.
 */
function ff_vp8_idct_add_neon, export=1
92
vld1.16 {q0-q1}, [r1,:128]
97
vmull.s16 q12, d1, d4[0]
98
vmull.s16 q13, d3, d4[0]
99
vqdmulh.s16 d20, d1, d4[1]
100
vqdmulh.s16 d23, d3, d4[1]
101
vshrn.s32 d21, q12, #16
102
vshrn.s32 d22, q13, #16
103
vadd.s16 d21, d21, d1
104
vadd.s16 d22, d22, d3
108
vadd.s16 d18, d21, d23
109
vsub.s16 d19, d20, d22
119
/* Second pass; the stores of q15 to r1 are interleaved with the multiplies. */
vmull.s16 q12, d1, d4[0]
120
vst1.16 {q15}, [r1,:128]!
121
vmull.s16 q13, d2, d4[0]
122
vst1.16 {q15}, [r1,:128]
123
vqdmulh.s16 d21, d1, d4[1]
124
vqdmulh.s16 d23, d2, d4[1]
125
vshrn.s32 d20, q12, #16
126
vshrn.s32 d22, q13, #16
127
vadd.i16 d20, d20, d1
128
vadd.i16 d22, d22, d2
132
vadd.i16 d18, d20, d23
133
/* Four 4-byte rows are fetched from r0 between the arithmetic instructions. */
vld1.32 {d20[]}, [r0,:32], r2
134
vsub.i16 d19, d21, d22
135
vld1.32 {d22[]}, [r0,:32], r2
137
vld1.32 {d23[]}, [r0,:32], r2
139
vld1.32 {d21[]}, [r0,:32], r2
144
/* Rewind r0 by the four rows just read. */
sub r0, r0, r2, lsl #2
156
vst1.32 {d0[0]}, [r0,:32], r2
157
vst1.32 {d0[1]}, [r0,:32], r2
158
vst1.32 {d1[1]}, [r0,:32], r2
159
vst1.32 {d1[0]}, [r0,:32], r2
164
/*
 * ff_vp8_idct_dc_add_neon (exported)
 *
 * Visible behaviour: reads four 4-byte rows from r0 (stride r2). Rows 0
 * and 1 are replicated into all lanes of d0 and d1, then rows 2 and 3
 * replace lane 1 of d0 and d1. r0 is rewound by 4*r2 and the four rows
 * are written back in row order (d0[0], d1[0], d0[1], d1[1]).
 * NOTE(review): loading the DC value and adding it to the pixels happen
 * on lines not visible in this excerpt -- confirm against the full
 * function.
 */
function ff_vp8_idct_dc_add_neon, export=1
170
vld1.32 {d0[]}, [r0,:32], r2
171
vld1.32 {d1[]}, [r0,:32], r2
172
vld1.32 {d0[1]}, [r0,:32], r2
173
vld1.32 {d1[1]}, [r0,:32], r2
176
sub r0, r0, r2, lsl #2
179
vst1.32 {d0[0]}, [r0,:32], r2
180
vst1.32 {d1[0]}, [r0,:32], r2
181
vst1.32 {d0[1]}, [r0,:32], r2
182
vst1.32 {d1[1]}, [r0,:32], r2
186
/*
 * ff_vp8_idct_dc_add4uv_neon (exported)
 *
 * Visible behaviour: four times in a row, one halfword is loaded from r1
 * and replicated across d16, d17, d18 and d19 in turn; the same location
 * is then overwritten with d0[0] and r1 advances by r3. q8 is
 * rounding-shifted right by 3 ("dc >>= 3"). Eight 8-byte rows are loaded
 * from r0 (stride r2) into d0-d7, and eight rows d20-d27 are stored
 * through r3 (stride r2).
 * NOTE(review): r3 is used first as the coefficient step and later as the
 * store pointer; the instructions that set it, the zeroing of d0, the
 * shift of q9 and the pixel additions are not visible in this excerpt --
 * confirm against the full function.
 */
function ff_vp8_idct_dc_add4uv_neon, export=1
189
vld1.16 {d16[]}, [r1,:16]
190
vst1.16 {d0[0]}, [r1,:16], r3
191
vld1.16 {d17[]}, [r1,:16]
192
vst1.16 {d0[0]}, [r1,:16], r3
193
vld1.16 {d18[]}, [r1,:16]
194
vst1.16 {d0[0]}, [r1,:16], r3
195
vld1.16 {d19[]}, [r1,:16]
196
vst1.16 {d0[0]}, [r1,:16], r3
198
vrshr.s16 q8, q8, #3 @ dc >>= 3
199
vld1.8 {d0}, [r0,:64], r2
201
vld1.8 {d1}, [r0,:64], r2
203
vld1.8 {d2}, [r0,:64], r2
205
vld1.8 {d3}, [r0,:64], r2
207
vld1.8 {d4}, [r0,:64], r2
209
vld1.8 {d5}, [r0,:64], r2
211
vld1.8 {d6}, [r0,:64], r2
213
vld1.8 {d7}, [r0,:64], r2
219
vst1.8 {d20}, [r3,:64], r2
221
vst1.8 {d21}, [r3,:64], r2
223
vst1.8 {d22}, [r3,:64], r2
225
vst1.8 {d23}, [r3,:64], r2
227
vst1.8 {d24}, [r3,:64], r2
229
vst1.8 {d25}, [r3,:64], r2
230
vst1.8 {d26}, [r3,:64], r2
231
vst1.8 {d27}, [r3,:64], r2
236
/*
 * ff_vp8_idct_dc_add4y_neon (exported)
 *
 * Visible behaviour: four times in a row, one halfword is loaded from r1
 * and replicated across d16, d17, d18 and d19 in turn; the same location
 * is then overwritten with d0[0] and r1 advances by r3. q8 is
 * rounding-shifted right by 3 ("dc >>= 3"). Four 16-byte rows are loaded
 * from r0 (stride r2) into q0-q3, r0 is rewound by 4*r2, and four rows
 * q10-q13 are stored back.
 * NOTE(review): the setup of r3 and d0 (presumably zero), the shift of q9
 * and the pixel additions are not visible in this excerpt -- confirm
 * against the full function.
 */
function ff_vp8_idct_dc_add4y_neon, export=1
239
vld1.16 {d16[]}, [r1,:16]
240
vst1.16 {d0[0]}, [r1,:16], r3
241
vld1.16 {d17[]}, [r1,:16]
242
vst1.16 {d0[0]}, [r1,:16], r3
243
vld1.16 {d18[]}, [r1,:16]
244
vst1.16 {d0[0]}, [r1,:16], r3
245
vld1.16 {d19[]}, [r1,:16]
246
vst1.16 {d0[0]}, [r1,:16], r3
247
vrshr.s16 q8, q8, #3 @ dc >>= 3
248
vld1.8 {q0}, [r0,:128], r2
250
vld1.8 {q1}, [r0,:128], r2
252
vld1.8 {q2}, [r0,:128], r2
254
vld1.8 {q3}, [r0,:128], r2
261
sub r0, r0, r2, lsl #2
267
vst1.8 {q10}, [r0,:128], r2
269
vst1.8 {q11}, [r0,:128], r2
271
vst1.8 {q12}, [r0,:128], r2
273
vst1.8 {q13}, [r0,:128], r2
284
/*
 * vp8_loop_filter: deblocking kernel shared by the loop-filter entry
 * points in this file, parameterised by inner and simple (both default
 * to 0).
 *
 * Register roles, as given by the existing per-line comments:
 *   q0..q7  P3, P2, P1, P0, Q0, Q1, Q2, Q3 (q0 and q7 must be preserved)
 *   q14     flim_E on entry (reused later)
 *   q15     flim_I on entry, later overwritten with hev_thresh from r12
 *   q13     the 0x80 mask used to convert between unsigned and signed
 *   q9      hev mask, q8 normal_limit mask, q10 the filter value w
 * NOTE(review): several near-identical c1/c2 sequences follow each other
 * below. They are presumably alternative arms of conditional assembly;
 * only one .if directive is visible in this excerpt, and the end of the
 * macro is not visible either. The constants in q12, q14 and q15 used by
 * the multiply and the +4/+3 adds are also set on lines not shown.
 * Confirm against the full macro before changing any instruction order.
 */
.macro vp8_loop_filter, inner=0, simple=0
286
vabd.u8 q9, q3, q4 @ abs(P0-Q0)
287
vabd.u8 q15, q2, q5 @ abs(P1-Q1)
288
vqadd.u8 q9, q9, q9 @ abs(P0-Q0) * 2
289
vshr.u8 q10, q15, #1 @ abs(P1-Q1) / 2
290
vqadd.u8 q11, q9, q10 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
292
vcle.u8 q8, q11, q14 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
294
@ calculate hev and normal_limit:
295
vabd.u8 q12, q2, q3 @ abs(P1-P0)
296
vabd.u8 q13, q5, q4 @ abs(Q1-Q0)
297
vabd.u8 q10, q0, q1 @ abs(P3-P2)
298
vabd.u8 q11, q1, q2 @ abs(P2-P1)
299
vcle.u8 q8, q12, q15 @ abs(P1-P0) <= flim_I
300
vcle.u8 q9, q13, q15 @ abs(Q1-Q0) <= flim_I
301
vcle.u8 q10, q10, q15 @ abs(P3-P2) <= flim_I
302
vcle.u8 q11, q11, q15 @ abs(P2-P1) <= flim_I
304
vabd.u8 q9, q7, q6 @ abs(Q3-Q2)
306
vabd.u8 q11, q6, q5 @ abs(Q2-Q1)
308
vcle.u8 q10, q9, q15 @ abs(Q3-Q2) <= flim_I
309
vcle.u8 q11, q11, q15 @ abs(Q2-Q1) <= flim_I
310
vabd.u8 q9, q3, q4 @ abs(P0-Q0)
311
vabd.u8 q15, q2, q5 @ abs(P1-Q1)
313
vqadd.u8 q9, q9, q9 @ abs(P0-Q0) * 2
315
vshr.u8 q10, q15, #1 @ abs(P1-Q1) / 2
316
vdup.8 q15, r12 @ hev_thresh
317
vqadd.u8 q11, q9, q10 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
318
vcgt.u8 q12, q12, q15 @ abs(P1-P0) > hev_thresh
319
vcle.u8 q11, q11, q14 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
320
vcgt.u8 q14, q13, q15 @ abs(Q1-Q0) > hev_thresh
330
@ convert to signed value:
331
veor q3, q3, q13 @ PS0 = P0 ^ 0x80
332
veor q4, q4, q13 @ QS0 = Q0 ^ 0x80
335
vsubl.s8 q10, d8, d6 @ QS0 - PS0
336
vsubl.s8 q11, d9, d7 @ (widened to 16bit)
337
veor q2, q2, q13 @ PS1 = P1 ^ 0x80
338
veor q5, q5, q13 @ QS1 = Q1 ^ 0x80
339
vmul.i16 q10, q10, q12 @ w = 3 * (QS0 - PS0)
340
vmul.i16 q11, q11, q12
342
vqsub.s8 q12, q2, q5 @ clamp(PS1-QS1)
346
vand q12, q12, q9 @ if(hev) w += clamp(PS1-QS1)
348
vaddw.s8 q10, q10, d24 @ w += clamp(PS1-QS1)
349
vaddw.s8 q11, q11, d25
350
vqmovn.s16 d20, q10 @ narrow result back into q10
352
.if !\inner && !\simple
353
veor q1, q1, q13 @ PS2 = P2 ^ 0x80
354
veor q6, q6, q13 @ QS2 = Q2 ^ 0x80
356
vand q10, q10, q8 @ w &= normal_limit
358
@ registers used at this point..
359
@ q0 -> P3 (don't corrupt)
361
@ q7 -> Q3 (don't corrupt)
367
@ q8, q11, q12 -> unused
369
@ filter_common: is4tap==1
370
@ c1 = clamp(w + 4) >> 3;
371
@ c2 = clamp(w + 3) >> 3;
372
@ Q0 = s2u(QS0 - c1);
373
@ P0 = s2u(PS0 + c2);
376
vqadd.s8 q11, q10, q14 @ c1 = clamp((w&hev)+4)
377
vqadd.s8 q12, q10, q15 @ c2 = clamp((w&hev)+3)
378
vshr.s8 q11, q11, #3 @ c1 >>= 3
379
vshr.s8 q12, q12, #3 @ c2 >>= 3
380
vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1)
381
vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2)
382
veor q4, q4, q13 @ Q0 = QS0 ^ 0x80
383
veor q3, q3, q13 @ P0 = PS0 ^ 0x80
384
veor q5, q5, q13 @ Q1 = QS1 ^ 0x80
385
veor q2, q2, q13 @ P1 = PS1 ^ 0x80
387
@ the !is4tap case of filter_common, only used for inner blocks
388
@ c3 = ((c1&~hev) + 1) >> 1;
389
@ Q1 = s2u(QS1 - c3);
390
@ P1 = s2u(PS1 + c3);
391
vqadd.s8 q11, q10, q14 @ c1 = clamp((w&hev)+4)
392
vqadd.s8 q12, q10, q15 @ c2 = clamp((w&hev)+3)
393
vshr.s8 q11, q11, #3 @ c1 >>= 3
394
vshr.s8 q12, q12, #3 @ c2 >>= 3
395
vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1)
396
vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2)
397
vbic q11, q11, q9 @ c1 & ~hev
398
veor q4, q4, q13 @ Q0 = QS0 ^ 0x80
399
vrshr.s8 q11, q11, #1 @ c3 >>= 1
400
veor q3, q3, q13 @ P0 = PS0 ^ 0x80
401
vqsub.s8 q5, q5, q11 @ QS1 = clamp(QS1-c3)
402
vqadd.s8 q2, q2, q11 @ PS1 = clamp(PS1+c3)
403
veor q5, q5, q13 @ Q1 = QS1 ^ 0x80
404
veor q2, q2, q13 @ P1 = PS1 ^ 0x80
406
vand q12, q10, q9 @ w & hev
407
vqadd.s8 q11, q12, q14 @ c1 = clamp((w&hev)+4)
408
vqadd.s8 q12, q12, q15 @ c2 = clamp((w&hev)+3)
409
vshr.s8 q11, q11, #3 @ c1 >>= 3
410
vshr.s8 q12, q12, #3 @ c2 >>= 3
411
vbic q10, q10, q9 @ w &= ~hev
412
vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1)
413
vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2)
416
@ a = clamp((27*w + 63) >> 7);
419
@ a = clamp((18*w + 63) >> 7);
422
@ a = clamp((9*w + 63) >> 7);
426
vshll.s8 q14, d20, #3
427
vshll.s8 q15, d21, #3
428
vaddw.s8 q14, q14, d20
429
vaddw.s8 q15, q15, d21
431
vadd.s16 q9, q9, q15 @ 9*w + 63
432
vadd.s16 q11, q8, q14
433
vadd.s16 q12, q9, q15 @ 18*w + 63
434
vadd.s16 q14, q11, q14
435
vadd.s16 q15, q12, q15 @ 27*w + 63
436
vqshrn.s16 d16, q8, #7
437
vqshrn.s16 d17, q9, #7 @ clamp(( 9*w + 63)>>7)
438
vqshrn.s16 d22, q11, #7
439
vqshrn.s16 d23, q12, #7 @ clamp((18*w + 63)>>7)
440
vqshrn.s16 d28, q14, #7
441
vqshrn.s16 d29, q15, #7 @ clamp((27*w + 63)>>7)
442
vqadd.s8 q1, q1, q8 @ PS2 = clamp(PS2+a)
443
vqsub.s8 q6, q6, q8 @ QS2 = clamp(QS2-a)
444
vqadd.s8 q2, q2, q11 @ PS1 = clamp(PS1+a)
445
vqsub.s8 q5, q5, q11 @ QS1 = clamp(QS1-a)
446
vqadd.s8 q3, q3, q14 @ PS0 = clamp(PS0+a)
447
vqsub.s8 q4, q4, q14 @ QS0 = clamp(QS0-a)
448
veor q3, q3, q13 @ P0 = PS0 ^ 0x80
449
veor q4, q4, q13 @ Q0 = QS0 ^ 0x80
450
veor q2, q2, q13 @ P1 = PS1 ^ 0x80
451
veor q5, q5, q13 @ Q1 = QS1 ^ 0x80
452
veor q1, q1, q13 @ P2 = PS2 ^ 0x80
453
veor q6, q6, q13 @ Q2 = QS2 ^ 0x80
457
.macro transpose8x16matrix
474
/*
 * vp8_v_loop_filter16: generates ff_vp8_v_loop_filter16<name>_neon, which
 * runs vp8_loop_filter over 16-byte rows addressed by r0 with stride r1.
 *
 * Visible behaviour: r0 is first moved back by r1 << (1 + !simple) rows.
 * Eight rows P3..Q3 are loaded into q0..q7 (the last load does not
 * advance r0), flim_E is broadcast from r2 into q14, flim_I from r3 into
 * q15, and hev_thresh is read into r12 from [sp, #64]. After the filter,
 * r0 is moved back six rows to P2 and q1..q6 (P2..Q2) are stored.
 * The two visible instantiations create the _inner and _simple variants.
 * NOTE(review): the conditional-assembly directives that skip P3/P2,
 * Q2/Q3 and flim_I for the simple filter, the register save/restore that
 * explains the #64 stack offset, and the end of the macro are not visible
 * in this excerpt -- confirm against the full macro.
 */
.macro vp8_v_loop_filter16 name, inner=0, simple=0
475
function ff_vp8_v_loop_filter16\name\()_neon, export=1
477
sub r0, r0, r1, lsl #1+!\simple
481
ldr r12, [sp, #64] @ hev_thresh
482
vld1.8 {q0}, [r0,:128], r1 @ P3
483
vld1.8 {q1}, [r0,:128], r1 @ P2
485
vld1.8 {q2}, [r0,:128], r1 @ P1
486
vld1.8 {q3}, [r0,:128], r1 @ P0
487
vld1.8 {q4}, [r0,:128], r1 @ Q0
488
vld1.8 {q5}, [r0,:128], r1 @ Q1
490
vld1.8 {q6}, [r0,:128], r1 @ Q2
491
vld1.8 {q7}, [r0,:128] @ Q3
492
vdup.8 q15, r3 @ flim_I
494
vdup.8 q14, r2 @ flim_E
496
vp8_loop_filter inner=\inner, simple=\simple
498
@ back up to P2: dst -= stride * 6
499
sub r0, r0, r1, lsl #2
501
sub r0, r0, r1, lsl #1
504
vst1.8 {q1}, [r0,:128], r1 @ P2
506
vst1.8 {q2}, [r0,:128], r1 @ P1
507
vst1.8 {q3}, [r0,:128], r1 @ P0
508
vst1.8 {q4}, [r0,:128], r1 @ Q0
509
vst1.8 {q5}, [r0,:128], r1 @ Q1
511
vst1.8 {q6}, [r0,:128] @ Q2
520
vp8_v_loop_filter16 _inner, inner=1
521
vp8_v_loop_filter16 _simple, simple=1
523
/*
 * vp8_v_loop_filter8uv: generates ff_vp8_v_loop_filter8uv<name>_neon,
 * which filters two 8-byte-wide planes (u at r0, v at r1, shared stride
 * r2) in a single vp8_loop_filter pass.
 *
 * Visible behaviour: both pointers are moved back four rows. Eight rows
 * are loaded from each plane, u into the even d registers and v into the
 * odd ones, so each of q0..q7 (P3..Q3) holds u in its low half and v in
 * its high half. flim_E comes from r3, flim_I from [sp, #64] and
 * hev_thresh from [sp, #68]; r12 carries flim_I first and hev_thresh
 * afterwards. After the filter both pointers are moved back six rows and
 * P2..Q2 are stored to each plane.
 * NOTE(review): the register save/restore that explains the stack offsets
 * and the end of the macro are not visible in this excerpt.
 */
.macro vp8_v_loop_filter8uv name, inner=0
524
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
526
sub r0, r0, r2, lsl #2
527
sub r1, r1, r2, lsl #2
528
ldr r12, [sp, #64] @ flim_I
531
vld1.8 {d0}, [r0,:64], r2 @ P3
532
vld1.8 {d1}, [r1,:64], r2 @ P3
533
vld1.8 {d2}, [r0,:64], r2 @ P2
534
vld1.8 {d3}, [r1,:64], r2 @ P2
535
vld1.8 {d4}, [r0,:64], r2 @ P1
536
vld1.8 {d5}, [r1,:64], r2 @ P1
537
vld1.8 {d6}, [r0,:64], r2 @ P0
538
vld1.8 {d7}, [r1,:64], r2 @ P0
539
vld1.8 {d8}, [r0,:64], r2 @ Q0
540
vld1.8 {d9}, [r1,:64], r2 @ Q0
541
vld1.8 {d10}, [r0,:64], r2 @ Q1
542
vld1.8 {d11}, [r1,:64], r2 @ Q1
543
vld1.8 {d12}, [r0,:64], r2 @ Q2
544
vld1.8 {d13}, [r1,:64], r2 @ Q2
545
vld1.8 {d14}, [r0,:64] @ Q3
546
vld1.8 {d15}, [r1,:64] @ Q3
548
vdup.8 q14, r3 @ flim_E
549
vdup.8 q15, r12 @ flim_I
550
ldr r12, [sp, #68] @ hev_thresh
552
vp8_loop_filter inner=\inner
554
@ back up to P2: u,v -= stride * 6
555
sub r0, r0, r2, lsl #2
556
sub r1, r1, r2, lsl #2
557
sub r0, r0, r2, lsl #1
558
sub r1, r1, r2, lsl #1
561
vst1.8 {d2}, [r0,:64], r2 @ P2
562
vst1.8 {d3}, [r1,:64], r2 @ P2
563
vst1.8 {d4}, [r0,:64], r2 @ P1
564
vst1.8 {d5}, [r1,:64], r2 @ P1
565
vst1.8 {d6}, [r0,:64], r2 @ P0
566
vst1.8 {d7}, [r1,:64], r2 @ P0
567
vst1.8 {d8}, [r0,:64], r2 @ Q0
568
vst1.8 {d9}, [r1,:64], r2 @ Q0
569
vst1.8 {d10}, [r0,:64], r2 @ Q1
570
vst1.8 {d11}, [r1,:64], r2 @ Q1
571
vst1.8 {d12}, [r0,:64] @ Q2
572
vst1.8 {d13}, [r1,:64] @ Q2
580
vp8_v_loop_filter8uv _inner, inner=1
582
/*
 * vp8_h_loop_filter16: generates ff_vp8_h_loop_filter16<name>_neon, which
 * runs vp8_loop_filter across 16 rows of 8 bytes addressed by r0 with
 * stride r1.
 *
 * Visible behaviour: rows 0-7 are loaded into the even d registers
 * (d0..d14) and rows 8-15 into the odd ones (d1..d15). flim_E is
 * broadcast from r2, flim_I from r3, and hev_thresh is read from
 * [sp, #64]. After the filter r0 is moved back 16 rows and the rows are
 * stored in the same order they were loaded.
 * The two visible instantiations create the _inner and _simple variants.
 * NOTE(review): the initial pointer adjustment, the transposes that
 * presumably surround the filter, the store of the final row (d15), the
 * conditional-assembly directives and the end of the macro are not
 * visible in this excerpt -- confirm against the full macro.
 */
.macro vp8_h_loop_filter16 name, inner=0, simple=0
583
function ff_vp8_h_loop_filter16\name\()_neon, export=1
587
ldr r12, [sp, #64] @ hev_thresh
591
vld1.8 {d0}, [r0], r1 @ load first 8-line src data
592
vld1.8 {d2}, [r0], r1
593
vld1.8 {d4}, [r0], r1
594
vld1.8 {d6}, [r0], r1
595
vld1.8 {d8}, [r0], r1
596
vld1.8 {d10}, [r0], r1
597
vld1.8 {d12}, [r0], r1
598
vld1.8 {d14}, [r0], r1
599
vld1.8 {d1}, [r0], r1 @ load second 8-line src data
600
vld1.8 {d3}, [r0], r1
601
vld1.8 {d5}, [r0], r1
602
vld1.8 {d7}, [r0], r1
603
vld1.8 {d9}, [r0], r1
604
vld1.8 {d11}, [r0], r1
605
vld1.8 {d13}, [r0], r1
606
vld1.8 {d15}, [r0], r1
610
vdup.8 q14, r2 @ flim_E
612
vdup.8 q15, r3 @ flim_I
615
vp8_loop_filter inner=\inner, simple=\simple
617
sub r0, r0, r1, lsl #4 @ backup 16 rows
622
vst1.8 {d0}, [r0], r1
623
vst1.8 {d2}, [r0], r1
624
vst1.8 {d4}, [r0], r1
625
vst1.8 {d6}, [r0], r1
626
vst1.8 {d8}, [r0], r1
627
vst1.8 {d10}, [r0], r1
628
vst1.8 {d12}, [r0], r1
629
vst1.8 {d14}, [r0], r1
630
vst1.8 {d1}, [r0], r1
631
vst1.8 {d3}, [r0], r1
632
vst1.8 {d5}, [r0], r1
633
vst1.8 {d7}, [r0], r1
634
vst1.8 {d9}, [r0], r1
635
vst1.8 {d11}, [r0], r1
636
vst1.8 {d13}, [r0], r1
645
vp8_h_loop_filter16 _inner, inner=1
646
vp8_h_loop_filter16 _simple, simple=1
648
/*
 * vp8_h_loop_filter8uv: generates ff_vp8_h_loop_filter8uv<name>_neon,
 * which runs vp8_loop_filter across 8 rows of 8 bytes from each of two
 * planes (u at r0, v at r1, shared stride r2).
 *
 * Visible behaviour: rows are loaded alternately, u into the even d
 * registers and v into the odd ones. flim_E is broadcast from r3, flim_I
 * is read from [sp, #64] and hev_thresh from [sp, #68]; r12 carries
 * flim_I first and hev_thresh afterwards. After the filter both pointers
 * are moved back 8 rows and the rows are stored in load order.
 * NOTE(review): the initial pointer adjustment, the transposes that
 * presumably surround the filter, the stores of the final u/v rows
 * (d14/d15) and the end of the macro are not visible in this excerpt --
 * confirm against the full macro.
 */
.macro vp8_h_loop_filter8uv name, inner=0
649
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
653
ldr r12, [sp, #64] @ flim_I
656
vld1.8 {d0}, [r0], r2 @ load u
657
vld1.8 {d1}, [r1], r2 @ load v
658
vld1.8 {d2}, [r0], r2
659
vld1.8 {d3}, [r1], r2
660
vld1.8 {d4}, [r0], r2
661
vld1.8 {d5}, [r1], r2
662
vld1.8 {d6}, [r0], r2
663
vld1.8 {d7}, [r1], r2
664
vld1.8 {d8}, [r0], r2
665
vld1.8 {d9}, [r1], r2
666
vld1.8 {d10}, [r0], r2
667
vld1.8 {d11}, [r1], r2
668
vld1.8 {d12}, [r0], r2
669
vld1.8 {d13}, [r1], r2
670
vld1.8 {d14}, [r0], r2
671
vld1.8 {d15}, [r1], r2
675
vdup.8 q14, r3 @ flim_E
676
vdup.8 q15, r12 @ flim_I
677
ldr r12, [sp, #68] @ hev_thresh
679
vp8_loop_filter inner=\inner
681
sub r0, r0, r2, lsl #3 @ backup u 8 rows
682
sub r1, r1, r2, lsl #3 @ backup v 8 rows
687
vst1.8 {d0}, [r0], r2
688
vst1.8 {d1}, [r1], r2
689
vst1.8 {d2}, [r0], r2
690
vst1.8 {d3}, [r1], r2
691
vst1.8 {d4}, [r0], r2
692
vst1.8 {d5}, [r1], r2
693
vst1.8 {d6}, [r0], r2
694
vst1.8 {d7}, [r1], r2
695
vst1.8 {d8}, [r0], r2
696
vst1.8 {d9}, [r1], r2
697
vst1.8 {d10}, [r0], r2
698
vst1.8 {d11}, [r1], r2
699
vst1.8 {d12}, [r0], r2
700
vst1.8 {d13}, [r1], r2
710
vp8_h_loop_filter8uv _inner, inner=1
712
/*
 * ff_put_vp8_pixels16_neon (exported)
 *
 * Visible behaviour: reads h from [sp, #0] into r12, then copies four
 * rows of 16 bytes from r2 (stride r3, no alignment hint) to r0 (stride
 * r1, 128-bit aligned).
 * NOTE(review): the loop that presumably repeats this h/4 times and the
 * return are not visible in this excerpt.
 */
function ff_put_vp8_pixels16_neon, export=1
713
ldr r12, [sp, #0] @ h
716
vld1.8 {q0}, [r2], r3
717
vld1.8 {q1}, [r2], r3
718
vld1.8 {q2}, [r2], r3
719
vld1.8 {q3}, [r2], r3
720
vst1.8 {q0}, [r0,:128], r1
721
vst1.8 {q1}, [r0,:128], r1
722
vst1.8 {q2}, [r0,:128], r1
723
vst1.8 {q3}, [r0,:128], r1
728
/*
 * ff_put_vp8_pixels8_neon (exported)
 *
 * Visible behaviour: reads h from [sp, #0] into r12, then copies four
 * rows of 8 bytes from r2 (stride r3, no alignment hint) to r0 (stride
 * r1, 64-bit aligned).
 * NOTE(review): the loop that presumably repeats this h/4 times and the
 * return are not visible in this excerpt.
 */
function ff_put_vp8_pixels8_neon, export=1
729
ldr r12, [sp, #0] @ h
732
vld1.8 {d0}, [r2], r3
733
vld1.8 {d1}, [r2], r3
734
vld1.8 {d2}, [r2], r3
735
vld1.8 {d3}, [r2], r3
736
vst1.8 {d0}, [r0,:64], r1
737
vst1.8 {d1}, [r0,:64], r1
738
vst1.8 {d2}, [r0,:64], r1
739
vst1.8 {d3}, [r0,:64], r1
744
/*
 * ff_put_vp8_pixels4_neon (exported)
 *
 * Only the load of h from [sp, #0] into r12 is visible in this excerpt.
 * NOTE(review): presumably the 4-byte-wide counterpart of the pixels8 and
 * pixels16 copies -- confirm against the full function.
 */
function ff_put_vp8_pixels4_neon, export=1
745
ldr r12, [sp, #0] @ h
761
/* 4/6-tap 8th-pel MC */
763
/*
 * vp8_epel8_h6 d, a, b: horizontal 6-tap filter producing 8 output bytes.
 *
 * a and b are treated as one contiguous byte vector; copies shifted by
 * 1..5 bytes are extracted into d27..d31. The visible arithmetic computes
 *   q10 = q10*d0[2] - q9*d0[1]  + q8*d0[0]
 *   q11 = q11*d0[3] - q12*d1[0] + q13*d1[1]
 * adds the two with signed saturation, then narrows to unsigned bytes
 * with a rounding shift right by 7 into d.
 * Expects the six filter taps in d0[0..3], d1[0..1].
 * Writes d27-d31, q10, q11 and d on the visible lines.
 * NOTE(review): the widening of the byte vectors into q8..q13 and the end
 * of the macro are not visible in this excerpt.
 */
.macro vp8_epel8_h6 d, a, b
764
vext.8 d27, \a, \b, #1
766
vext.8 d28, \a, \b, #2
768
vext.8 d29, \a, \b, #3
770
vext.8 d30, \a, \b, #4
772
vext.8 d31, \a, \b, #5
774
vmul.u16 q10, q10, d0[2]
776
vmul.u16 q11, q11, d0[3]
777
vmls.u16 q10, q9, d0[1]
778
vmls.u16 q11, q12, d1[0]
779
vmla.u16 q10, q8, d0[0]
780
vmla.u16 q11, q13, d1[1]
781
vqadd.s16 q11, q10, q11
782
vqrshrun.s16 \d, q11, #7
785
/*
 * vp8_epel16_h6 d0, d1, s0, s1, s2, q0, q1: horizontal 6-tap filter
 * producing 16 output bytes, the low 8 in d0 and the high 8 in d1.
 *
 * Copies of the q0:q1 byte vector shifted by 1..5 bytes are extracted
 * into q8, q3, q14, q15 and q2. Two pairs of accumulators are built with
 * the same tap pattern as vp8_epel8_h6 (taps d0[1] and d1[0] subtracted,
 * the rest added), summed with signed saturation, and narrowed to
 * unsigned bytes with a rounding shift right by 7.
 * Expects the six filter taps in d0[0..3], d1[0..1].
 * NOTE(review): the widening instructions that feed q9..q13, q1 and q2,
 * any use of s2, and the end of the macro are not visible in this
 * excerpt. Inside this macro "d0[n]" and "d1[n]" are register lanes,
 * while "\d0" and "\d1" are the macro arguments.
 */
.macro vp8_epel16_h6 d0, d1, s0, s1, s2, q0, q1
786
vext.8 q14, \q0, \q1, #3
787
vext.8 q15, \q0, \q1, #4
790
vext.8 q3, \q0, \q1, #2
793
vext.8 q8, \q0, \q1, #1
796
vext.8 q2, \q0, \q1, #5
801
vmul.u16 q11, q11, d0[3]
802
vmul.u16 q10, q10, d0[2]
803
vmul.u16 q3, q3, d0[2]
804
vmul.u16 q14, q14, d0[3]
805
vmls.u16 q11, q12, d1[0]
808
vmls.u16 q10, q9, d0[1]
809
vmls.u16 q3, q8, d0[1]
810
vmls.u16 q14, q15, d1[0]
811
vmla.u16 q10, q12, d0[0]
812
vmla.u16 q11, q13, d1[1]
813
vmla.u16 q3, q1, d0[0]
814
vmla.u16 q14, q2, d1[1]
815
vqadd.s16 q11, q10, q11
816
vqadd.s16 q14, q3, q14
817
vqrshrun.s16 \d0, q11, #7
818
vqrshrun.s16 \d1, q14, #7
821
/*
 * vp8_epel8_v6 d0, s0..s5: vertical 6-tap filter producing one row of 8
 * output bytes in d0 from six source rows.
 *
 * Visible arithmetic:
 *   q10 = q10*d0[2] - q9*d0[1]  + q8*d0[0]
 *   q11 = q11*d0[3] - q12*d1[0] + q13*d1[1]
 * followed by a signed saturating add and an unsigned saturating narrow
 * with a rounding shift right by 7.
 * NOTE(review): the widening of s0..s5 into q8..q13 (presumably in that
 * order) and the end of the macro are not visible in this excerpt.
 */
.macro vp8_epel8_v6 d0, s0, s1, s2, s3, s4, s5
828
vmul.u16 q10, q10, d0[2]
829
vmul.u16 q11, q11, d0[3]
830
vmls.u16 q10, q9, d0[1]
831
vmls.u16 q11, q12, d1[0]
832
vmla.u16 q10, q8, d0[0]
833
vmla.u16 q11, q13, d1[1]
834
vqadd.s16 q11, q10, q11
835
vqrshrun.s16 \d0, q11, #7
838
/*
 * vp8_epel8_v6_y2 d0, d1, s0..s6: vertical 6-tap filter producing two
 * consecutive rows of 8 output bytes (d0 and d1) from seven source rows.
 *
 * Visible arithmetic builds four 16-bit accumulators: q10 and q15 are
 * summed into the first output row, q11 and q14 into the second. Taps
 * d0[1] and d1[0] are subtracted and the others added; each row is then
 * narrowed to unsigned bytes with a rounding shift right by 7.
 * NOTE(review): the widening of s0..s6 into q8..q14 and the end of the
 * macro are not visible in this excerpt; the register-to-row mapping is
 * deliberately shuffled, so check it against the full macro before
 * editing.
 */
.macro vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
846
vmul.u16 q10, q10, d0[0]
847
vmul.u16 q15, q11, d0[3]
848
vmul.u16 q11, q11, d0[2]
849
vmul.u16 q14, q14, d1[1]
850
vmls.u16 q10, q9, d0[1]
851
vmls.u16 q15, q12, d1[0]
852
vmls.u16 q11, q8, d0[1]
853
vmls.u16 q14, q13, d1[0]
854
vmla.u16 q10, q8, d0[2]
855
vmla.u16 q15, q13, d1[1]
856
vmla.u16 q11, q9, d0[0]
857
vmla.u16 q14, q12, d0[3]
858
vqadd.s16 q15, q10, q15
859
vqadd.s16 q14, q11, q14
860
vqrshrun.s16 \d0, q15, #7
861
vqrshrun.s16 \d1, q14, #7
864
/*
 * vp8_epel8_h4 d, a, b: horizontal 4-tap filter producing 8 output bytes.
 *
 * Copies of the a:b byte vector shifted by 1..3 bytes are extracted into
 * d28..d30. The visible arithmetic computes
 *   q10 = q10*d0[2] - q9*d0[1]
 *   q11 = q11*d0[3] - q12*d1[0]
 * adds the two with signed saturation, then narrows to unsigned bytes
 * with a rounding shift right by 7 into d. Only the four middle taps
 * d0[1], d0[2], d0[3] and d1[0] are used.
 * NOTE(review): the widening of the byte vectors into q9..q12 and the end
 * of the macro are not visible in this excerpt.
 */
.macro vp8_epel8_h4 d, a, b
865
vext.8 d28, \a, \b, #1
867
vext.8 d29, \a, \b, #2
869
vext.8 d30, \a, \b, #3
872
vmul.u16 q10, q10, d0[2]
873
vmul.u16 q11, q11, d0[3]
874
vmls.u16 q10, q9, d0[1]
875
vmls.u16 q11, q12, d1[0]
876
vqadd.s16 q11, q10, q11
877
vqrshrun.s16 \d, q11, #7
880
/*
 * vp8_epel8_v4_y2 d0, d1, s0..s4: vertical 4-tap filter producing two
 * consecutive rows of 8 output bytes (d0 and d1) from five source rows.
 *
 * Visible arithmetic:
 *   row 0: (q10*d0[2] - q9*d0[1])  + (q11*d0[3] - q12*d1[0])
 *   row 1: (q11*d0[2] - q10*d0[1]) + (q12*d0[3] - q13*d1[0])
 * Each sum uses signed saturation and is narrowed to unsigned bytes with
 * a rounding shift right by 7.
 * NOTE(review): the widening of s0..s4 into q9..q13 (presumably in that
 * order) and the end of the macro are not visible in this excerpt.
 */
.macro vp8_epel8_v4_y2 d0, d1, s0, s1, s2, s3, s4
886
vmul.u16 q8, q10, d0[2]
887
vmul.u16 q14, q11, d0[3]
888
vmul.u16 q11, q11, d0[2]
889
vmul.u16 q15, q12, d0[3]
890
vmls.u16 q8, q9, d0[1]
891
vmls.u16 q14, q12, d1[0]
892
vmls.u16 q11, q10, d0[1]
893
vmls.u16 q15, q13, d1[0]
894
vqadd.s16 q8, q8, q14
895
vqadd.s16 q11, q11, q15
896
vqrshrun.s16 \d0, q8, #7
897
vqrshrun.s16 \d1, q11, #7
900
/*
 * ff_put_vp8_epel16_v6_neon (exported): 16-wide vertical 6-tap filter.
 *
 * Visible behaviour: the source pointer r2 is moved back two rows
 * (stride r3). my is read from [sp, #80] and h from [sp, #72]; the filter
 * row at subpel_filters - 16 + my*16 is loaded into q0. Seven 16-byte
 * rows are loaded into d2-d15, after which r2 is moved back four rows
 * (a net advance of two). vp8_epel8_v6_y2 is applied to the low and high
 * 8 columns, and the two resulting rows are stored to r0 (stride r1).
 * NOTE(review): the register saves that explain the stack offsets, the
 * loop on r12 (h) and the return are not visible in this excerpt.
 */
function ff_put_vp8_epel16_v6_neon, export=1
901
sub r2, r2, r3, lsl #1
905
ldr r4, [sp, #80] @ my
906
movrel lr, subpel_filters-16
907
ldr r12, [sp, #72] @ h
908
add r4, lr, r4, lsl #4
909
vld1.16 {q0}, [r4,:128]
911
vld1.8 {d2-d3}, [r2], r3
912
vld1.8 {d4-d5}, [r2], r3
913
vld1.8 {d6-d7}, [r2], r3
914
vld1.8 {d8-d9}, [r2], r3
915
vld1.8 {d10-d11},[r2], r3
916
vld1.8 {d12-d13},[r2], r3
917
vld1.8 {d14-d15},[r2]
918
sub r2, r2, r3, lsl #2
920
vp8_epel8_v6_y2 d2, d4, d2, d4, d6, d8, d10, d12, d14
921
vp8_epel8_v6_y2 d3, d5, d3, d5, d7, d9, d11, d13, d15
923
vst1.8 {d2-d3}, [r0,:128], r1
924
vst1.8 {d4-d5}, [r0,:128], r1
932
/*
 * ff_put_vp8_epel16_h6_neon (exported): 16-wide horizontal 6-tap filter.
 *
 * Visible behaviour: mx is read from [sp, #12] and h from [sp, #8]; the
 * filter row at subpel_filters - 16 + mx*16 is loaded into q0. Each
 * visible iteration loads 24 source bytes (d2-d4) from r2 (stride r3),
 * applies vp8_epel16_h6 and stores one 16-byte row to r0 (stride r1).
 * NOTE(review): the initial source-pointer adjustment, the register saves
 * that explain the stack offsets, the loop on r12 (h) and the return are
 * not visible in this excerpt.
 */
function ff_put_vp8_epel16_h6_neon, export=1
936
ldr r4, [sp, #12] @ mx
937
movrel lr, subpel_filters-16
938
ldr r12, [sp, #8] @ h
939
add r4, lr, r4, lsl #4
940
vld1.16 {q0}, [r4,:128]
942
vld1.8 {d2-d4}, [r2], r3
944
vp8_epel16_h6 d2, d3, d2, d3, d4, q1, q2
946
vst1.8 {d2-d3}, [r0,:128], r1
953
/*
 * ff_put_vp8_epel16_h6v6_neon (exported): 16-wide 6-tap horizontal filter
 * followed by a 6-tap vertical filter, run as two passes.
 *
 * First pass: r2 is moved back two rows; mx comes from [sp, #28] and h
 * from [sp, #24]. Each row is filtered with vp8_epel16_h6 and written to
 * the buffer addressed by lr (post-incremented by 16 bytes).
 * Second pass: my comes from [sp, #336+16+32] and h from
 * [sp, #336+16+24]. Six 16-byte rows are read back through lr (the last
 * load without write-back), vp8_epel8_v6 filters the low and high 8
 * columns, and one row is stored to r0 (stride r1).
 * NOTE(review): lr holds the filter-table base right after each movrel
 * and is re-pointed at the intermediate buffer on lines not visible here.
 * That buffer is presumably a 336-byte (21 rows x 16) stack area.
 * Register saves, loop control and the return are also not visible.
 */
function ff_put_vp8_epel16_h6v6_neon, export=1
954
sub r2, r2, r3, lsl #1
959
@ first pass (horizontal):
960
ldr r4, [sp, #28] @ mx
961
movrel lr, subpel_filters-16
962
ldr r12, [sp, #24] @ h
963
add r4, lr, r4, lsl #4
965
vld1.16 {q0}, [r4,:128]
970
vld1.8 {d2,d3,d4}, [r2], r3
972
vp8_epel16_h6 d2, d3, d2, d3, d4, q1, q2
974
vst1.8 {d2-d3}, [lr,:128]!
978
@ second pass (vertical):
979
ldr r4, [sp, #336+16+32] @ my
980
movrel lr, subpel_filters-16
981
ldr r12, [sp, #336+16+24] @ h
982
add r4, lr, r4, lsl #4
984
vld1.16 {q0}, [r4,:128]
987
vld1.8 {d2-d5}, [lr,:128]!
988
vld1.8 {d6-d9}, [lr,:128]!
989
vld1.8 {d28-d31},[lr,:128]
992
vp8_epel8_v6 d2, d2, d4, d6, d8, d28, d30
993
vp8_epel8_v6 d3, d3, d5, d7, d9, d29, d31
995
vst1.8 {d2-d3}, [r0,:128], r1
1004
/*
 * ff_put_vp8_epel8_v6_neon (exported): 8-wide vertical 6-tap filter.
 *
 * Visible behaviour: r2 is moved back two rows (stride r3). my is read
 * from [sp, #16] and h from [sp, #8]; the filter row at
 * subpel_filters - 16 + my*16 is loaded into q0. Six 8-byte rows are
 * loaded into d2-d7, r2 is moved back four rows, vp8_epel8_v6_y2 produces
 * two output rows, and both are stored to r0 (stride r1).
 * NOTE(review): the load of the seventh row into d28, the register saves,
 * the loop on r12 (h) and the return are not visible in this excerpt.
 */
function ff_put_vp8_epel8_v6_neon, export=1
1005
sub r2, r2, r3, lsl #1
1008
ldr r4, [sp, #16] @ my
1009
movrel lr, subpel_filters-16
1010
ldr r12, [sp, #8] @ h
1011
add r4, lr, r4, lsl #4
1012
vld1.16 {q0}, [r4,:128]
1014
vld1.8 {d2}, [r2], r3
1015
vld1.8 {d3}, [r2], r3
1016
vld1.8 {d4}, [r2], r3
1017
vld1.8 {d5}, [r2], r3
1018
vld1.8 {d6}, [r2], r3
1019
vld1.8 {d7}, [r2], r3
1022
sub r2, r2, r3, lsl #2
1024
vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28
1026
vst1.8 {d2}, [r0,:64], r1
1027
vst1.8 {d3}, [r0,:64], r1
1034
/*
 * ff_put_vp8_epel8_h6_neon (exported): 8-wide horizontal 6-tap filter.
 *
 * Visible behaviour: mx is read from [sp, #12] and h from [sp, #8]; the
 * filter row at subpel_filters - 16 + mx*16 is loaded into q0. Each
 * visible iteration loads 16 source bytes (d2, d3) from r2 (stride r3),
 * applies vp8_epel8_h6 and stores one 8-byte row to r0 (stride r1).
 * NOTE(review): the initial source-pointer adjustment, the register
 * saves, the loop on r12 (h) and the return are not visible in this
 * excerpt.
 */
function ff_put_vp8_epel8_h6_neon, export=1
1038
ldr r4, [sp, #12] @ mx
1039
movrel lr, subpel_filters-16
1040
ldr r12, [sp, #8] @ h
1041
add r4, lr, r4, lsl #4
1042
vld1.16 {q0}, [r4,:128]
1044
vld1.8 {d2,d3}, [r2], r3
1046
vp8_epel8_h6 d2, d2, d3
1048
vst1.8 {d2}, [r0,:64], r1
1055
/*
 * ff_put_vp8_epel8_h6v6_neon (exported): 8-wide 6-tap horizontal filter
 * followed by a 6-tap vertical filter, run as two passes.
 *
 * First pass: r2 is moved back two rows; mx comes from [sp, #12] and h
 * from [sp, #8]. Each row is filtered with vp8_epel8_h6 and written to
 * the buffer addressed by lr (post-incremented by 8 bytes).
 * Second pass: my comes from [sp, #168+16+16] and h from [sp, #168+16+8].
 * Seven 8-byte rows are read back through lr into d2-d7 and d30,
 * vp8_epel8_v6_y2 produces two output rows, and both are stored to r0
 * (stride r1).
 * NOTE(review): lr is re-pointed from the filter table to the
 * intermediate buffer on lines not visible here. That buffer is
 * presumably a 168-byte (21 rows x 8) stack area. Register saves, loop
 * control and the return are also not visible.
 */
function ff_put_vp8_epel8_h6v6_neon, export=1
1056
sub r2, r2, r3, lsl #1
1060
@ first pass (horizontal):
1061
ldr r4, [sp, #12] @ mx
1062
movrel lr, subpel_filters-16
1063
ldr r12, [sp, #8] @ h
1064
add r4, lr, r4, lsl #4
1066
vld1.16 {q0}, [r4,:128]
1071
vld1.8 {d2,d3}, [r2], r3
1073
vp8_epel8_h6 d2, d2, d3
1075
vst1.8 {d2}, [lr,:64]!
1079
@ second pass (vertical):
1080
ldr r4, [sp, #168+16+16] @ my
1081
movrel lr, subpel_filters-16
1082
ldr r12, [sp, #168+16+8] @ h
1083
add r4, lr, r4, lsl #4
1085
vld1.16 {q0}, [r4,:128]
1088
vld1.8 {d2-d5}, [lr,:128]!
1089
vld1.8 {d6-d7}, [lr,:128]!
1090
vld1.8 {d30}, [lr,:64]
1093
vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30
1095
vst1.8 {d2}, [r0,:64], r1
1096
vst1.8 {d3}, [r0,:64], r1
1104
/*
 * ff_put_vp8_epel8_v4_neon (exported): 8-wide vertical 4-tap filter.
 *
 * Visible behaviour: my is read from [sp, #16] and h from [sp, #8]; the
 * filter row at subpel_filters - 16 + my*16 is loaded into q0. Four
 * 8-byte rows are loaded into d2-d5 from r2 (stride r3), r2 is moved back
 * two rows, vp8_epel8_v4_y2 produces two output rows, and both are stored
 * to r0 (stride r1).
 * NOTE(review): the initial source-pointer adjustment, the load of the
 * fifth row into d6, the register saves, the loop on r12 (h) and the
 * return are not visible in this excerpt.
 */
function ff_put_vp8_epel8_v4_neon, export=1
1108
ldr r4, [sp, #16] @ my
1109
movrel lr, subpel_filters-16
1110
ldr r12, [sp, #8] @ h
1111
add r4, lr, r4, lsl #4
1112
vld1.16 {q0}, [r4,:128]
1114
vld1.8 {d2}, [r2], r3
1115
vld1.8 {d3}, [r2], r3
1116
vld1.8 {d4}, [r2], r3
1117
vld1.8 {d5}, [r2], r3
1119
sub r2, r2, r3, lsl #1
1121
vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
1123
vst1.8 {d2}, [r0,:64], r1
1124
vst1.8 {d3}, [r0,:64], r1
1131
/*
 * ff_put_vp8_epel8_h4_neon (exported): 8-wide horizontal 4-tap filter.
 *
 * Visible behaviour: mx is read from [sp, #12] and h from [sp, #8]; the
 * filter row at subpel_filters - 16 + mx*16 is loaded into q0. Each
 * visible iteration loads 16 source bytes (d2, d3) from r2 (stride r3),
 * applies vp8_epel8_h4 and stores one 8-byte row to r0 (stride r1).
 * NOTE(review): the initial source-pointer adjustment, the register
 * saves, the loop on r12 (h) and the return are not visible in this
 * excerpt.
 */
function ff_put_vp8_epel8_h4_neon, export=1
1135
ldr r4, [sp, #12] @ mx
1136
movrel lr, subpel_filters-16
1137
ldr r12, [sp, #8] @ h
1138
add r4, lr, r4, lsl #4
1139
vld1.16 {q0}, [r4,:128]
1141
vld1.8 {d2,d3}, [r2], r3
1143
vp8_epel8_h4 d2, d2, d3
1145
vst1.8 {d2}, [r0,:64], r1
1152
/*
 * ff_put_vp8_epel8_h4v4_neon (exported): 8-wide 4-tap horizontal filter
 * followed by a 4-tap vertical filter, run as two passes.
 *
 * First pass: mx comes from [sp, #12] and h from [sp, #8]. Each row is
 * filtered with vp8_epel8_h4 and written to the buffer addressed by lr
 * (post-incremented by 8 bytes).
 * Second pass: my comes from [sp, #168+16+16] and h from [sp, #168+16+8].
 * Five 8-byte rows are read back through lr into d2-d6, vp8_epel8_v4_y2
 * produces two output rows, and both are stored to r0 (stride r1).
 * NOTE(review): the initial source-pointer adjustment, the re-pointing of
 * lr at the intermediate buffer (presumably a 168-byte stack area),
 * register saves, loop control and the return are not visible in this
 * excerpt.
 */
function ff_put_vp8_epel8_h4v4_neon, export=1
1157
@ first pass (horizontal):
1158
ldr r4, [sp, #12] @ mx
1159
movrel lr, subpel_filters-16
1160
ldr r12, [sp, #8] @ h
1161
add r4, lr, r4, lsl #4
1163
vld1.16 {q0}, [r4,:128]
1168
vld1.8 {d2,d3}, [r2], r3
1170
vp8_epel8_h4 d2, d2, d3
1172
vst1.8 {d2}, [lr,:64]!
1176
@ second pass (vertical):
1177
ldr r4, [sp, #168+16+16] @ my
1178
movrel lr, subpel_filters-16
1179
ldr r12, [sp, #168+16+8] @ h
1180
add r4, lr, r4, lsl #4
1182
vld1.16 {q0}, [r4,:128]
1185
vld1.8 {d2-d5}, [lr,:128]!
1186
vld1.8 {d6}, [lr,:64]
1189
vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
1191
vst1.8 {d2}, [r0,:64], r1
1192
vst1.8 {d3}, [r0,:64], r1
1200
/*
 * ff_put_vp8_epel8_h6v4_neon (exported): 8-wide 6-tap horizontal filter
 * followed by a 4-tap vertical filter, run as two passes.
 *
 * First pass: mx comes from [sp, #12] and h from [sp, #8]. Each row is
 * filtered with vp8_epel8_h6 and written to the buffer addressed by lr
 * (post-incremented by 8 bytes).
 * Second pass: my comes from [sp, #168+16+16] and h from [sp, #168+16+8].
 * Five 8-byte rows are read back through lr into d2-d6, vp8_epel8_v4_y2
 * produces two output rows, and both are stored to r0 (stride r1).
 * NOTE(review): the initial source-pointer adjustment, the re-pointing of
 * lr at the intermediate buffer (presumably a 168-byte stack area),
 * register saves, loop control and the return are not visible in this
 * excerpt.
 */
function ff_put_vp8_epel8_h6v4_neon, export=1
1205
@ first pass (horizontal):
1206
ldr r4, [sp, #12] @ mx
1207
movrel lr, subpel_filters-16
1208
ldr r12, [sp, #8] @ h
1209
add r4, lr, r4, lsl #4
1211
vld1.16 {q0}, [r4,:128]
1216
vld1.8 {d2,d3}, [r2], r3
1218
vp8_epel8_h6 d2, d2, d3
1220
vst1.8 {d2}, [lr,:64]!
1224
@ second pass (vertical):
1225
ldr r4, [sp, #168+16+16] @ my
1226
movrel lr, subpel_filters-16
1227
ldr r12, [sp, #168+16+8] @ h
1228
add r4, lr, r4, lsl #4
1230
vld1.16 {q0}, [r4,:128]
1233
vld1.8 {d2-d5}, [lr,:128]!
1234
vld1.8 {d6}, [lr,:64]
1237
vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
1239
vst1.8 {d2}, [r0,:64], r1
1240
vst1.8 {d3}, [r0,:64], r1
1248
/*
 * ff_put_vp8_epel8_h4v6_neon (exported): 8-wide 4-tap horizontal filter
 * followed by a 6-tap vertical filter, run as two passes.
 *
 * First pass: r2 is moved back two rows; mx comes from [sp, #12] and h
 * from [sp, #8]. Each row is filtered with vp8_epel8_h4 and written to
 * the buffer addressed by lr (post-incremented by 8 bytes).
 * Second pass: my comes from [sp, #168+16+16] and h from [sp, #168+16+8].
 * Seven 8-byte rows are read back through lr into d2-d7 and d30,
 * vp8_epel8_v6_y2 produces two output rows, and both are stored to r0
 * (stride r1).
 * NOTE(review): the re-pointing of lr at the intermediate buffer
 * (presumably a 168-byte stack area), register saves, loop control and
 * the return are not visible in this excerpt.
 */
function ff_put_vp8_epel8_h4v6_neon, export=1
1249
sub r2, r2, r3, lsl #1
1253
@ first pass (horizontal):
1254
ldr r4, [sp, #12] @ mx
1255
movrel lr, subpel_filters-16
1256
ldr r12, [sp, #8] @ h
1257
add r4, lr, r4, lsl #4
1259
vld1.16 {q0}, [r4,:128]
1264
vld1.8 {d2,d3}, [r2], r3
1266
vp8_epel8_h4 d2, d2, d3
1268
vst1.8 {d2}, [lr,:64]!
1272
@ second pass (vertical):
1273
ldr r4, [sp, #168+16+16] @ my
1274
movrel lr, subpel_filters-16
1275
ldr r12, [sp, #168+16+8] @ h
1276
add r4, lr, r4, lsl #4
1278
vld1.16 {q0}, [r4,:128]
1281
vld1.8 {d2-d5}, [lr,:128]!
1282
vld1.8 {d6-d7}, [lr,:128]!
1283
vld1.8 {d30}, [lr,:64]
1286
vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30
1288
vst1.8 {d2}, [r0,:64], r1
1289
vst1.8 {d3}, [r0,:64], r1
1299
/*
 * ff_put_vp8_epel4_v6_neon (exported): 4-wide vertical 6-tap filter that
 * handles four output rows per visible iteration by packing two 4-byte
 * rows into each d register.
 *
 * Visible behaviour: r2 is moved back two rows (stride r3). my is read
 * from [sp, #16] and h from [sp, #8]; the filter row at
 * subpel_filters - 16 + my*16 is loaded into q0. Seven rows are loaded
 * (replicated) into d2-d7 and d28, r2 is moved back four rows, and the
 * seven rows starting two rows further down replace lane 1 of the same
 * registers. r2 is moved back four rows again, a net advance of four.
 * vp8_epel8_v6_y2 then yields rows 0 and 2 in d2 and rows 1 and 3 in d3,
 * stored to r0 (stride r1) in the order d2[0], d3[0], d2[1], d3[1].
 * NOTE(review): register saves, the loop on r12 (h) and the return are
 * not visible in this excerpt.
 */
function ff_put_vp8_epel4_v6_neon, export=1
1300
sub r2, r2, r3, lsl #1
1303
ldr r4, [sp, #16] @ my
1304
movrel lr, subpel_filters-16
1305
ldr r12, [sp, #8] @ h
1306
add r4, lr, r4, lsl #4
1307
vld1.16 {q0}, [r4,:128]
1309
vld1.32 {d2[]}, [r2], r3
1310
vld1.32 {d3[]}, [r2], r3
1311
vld1.32 {d4[]}, [r2], r3
1312
vld1.32 {d5[]}, [r2], r3
1313
vld1.32 {d6[]}, [r2], r3
1314
vld1.32 {d7[]}, [r2], r3
1315
vld1.32 {d28[]}, [r2]
1316
sub r2, r2, r3, lsl #2
1317
vld1.32 {d2[1]}, [r2], r3
1318
vld1.32 {d3[1]}, [r2], r3
1319
vld1.32 {d4[1]}, [r2], r3
1320
vld1.32 {d5[1]}, [r2], r3
1321
vld1.32 {d6[1]}, [r2], r3
1322
vld1.32 {d7[1]}, [r2], r3
1323
vld1.32 {d28[1]}, [r2]
1324
sub r2, r2, r3, lsl #2
1326
vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28
1328
vst1.32 {d2[0]}, [r0,:32], r1
1329
vst1.32 {d3[0]}, [r0,:32], r1
1330
vst1.32 {d2[1]}, [r0,:32], r1
1331
vst1.32 {d3[1]}, [r0,:32], r1
1338
/*
 * ff_put_vp8_epel4_h6_neon (exported): 4-wide horizontal 6-tap filter.
 *
 * Visible behaviour: mx is read from [sp, #12] and h from [sp, #8]; the
 * filter row at subpel_filters - 16 + mx*16 is loaded into q0. Each
 * visible iteration loads 16 source bytes (q1) from r2 (stride r3),
 * applies the 8-wide vp8_epel8_h6 and stores only the first four result
 * bytes (d2[0]) to r0 (stride r1).
 * NOTE(review): the initial source-pointer adjustment, the register
 * saves, the loop on r12 (h) and the return are not visible in this
 * excerpt.
 */
function ff_put_vp8_epel4_h6_neon, export=1
1342
ldr r4, [sp, #12] @ mx
1343
movrel lr, subpel_filters-16
1344
ldr r12, [sp, #8] @ h
1345
add r4, lr, r4, lsl #4
1346
vld1.16 {q0}, [r4,:128]
1348
vld1.8 {q1}, [r2], r3
1349
vp8_epel8_h6 d2, d2, d3
1350
vst1.32 {d2[0]}, [r0,:32], r1
1357
/*
 * ff_put_vp8_epel4_h6v6_neon (exported): 4-wide 6-tap horizontal filter
 * followed by a 6-tap vertical filter, run as two passes.
 *
 * First pass: r2 is moved back two rows; mx comes from [sp, #12] and h
 * from [sp, #8]. Each row is filtered with vp8_epel8_h6 and its first
 * four bytes are appended to the buffer addressed by lr.
 * Second pass: my comes from [sp, #52+16+16] and h from [sp, #52+16+8].
 * Two groups of loads through lr fill d2-d3, d6, d28 and then d4-d5, d7,
 * d28[1], so that vp8_epel8_v6_y2 produces four output rows at once.
 * They are stored to r0 (stride r1) in the order d2[0], d3[0], d2[1],
 * d3[1].
 * NOTE(review): the re-pointing of lr at the intermediate buffer
 * (presumably a 52-byte stack area) and its adjustment between the two
 * load groups, register saves, loop control and the return are not
 * visible in this excerpt.
 */
function ff_put_vp8_epel4_h6v6_neon, export=1
1358
sub r2, r2, r3, lsl #1
1362
ldr r4, [sp, #12] @ mx
1363
movrel lr, subpel_filters-16
1364
ldr r12, [sp, #8] @ h
1365
add r4, lr, r4, lsl #4
1367
vld1.16 {q0}, [r4,:128]
1372
vld1.8 {q1}, [r2], r3
1373
vp8_epel8_h6 d2, d2, d3
1374
vst1.32 {d2[0]}, [lr,:32]!
1378
ldr r4, [sp, #52+16+16] @ my
1379
movrel lr, subpel_filters-16
1380
ldr r12, [sp, #52+16+8] @ h
1381
add r4, lr, r4, lsl #4
1383
vld1.16 {q0}, [r4,:128]
1386
vld1.8 {d2-d3}, [lr,:128]!
1387
vld1.8 {d6}, [lr,:64]!
1388
vld1.32 {d28[]}, [lr,:32]
1390
vld1.8 {d4-d5}, [lr]!
1391
vld1.8 {d7}, [lr,:64]!
1392
vld1.32 {d28[1]}, [lr,:32]
1396
vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28
1397
vst1.32 {d2[0]}, [r0,:32], r1
1398
vst1.32 {d3[0]}, [r0,:32], r1
1399
vst1.32 {d2[1]}, [r0,:32], r1
1400
vst1.32 {d3[1]}, [r0,:32], r1
1408
/*
 * ff_put_vp8_epel4_h4v6_neon (exported): 4-wide 4-tap horizontal filter
 * followed by a 6-tap vertical filter, run as two passes.
 *
 * First pass: r2 is moved back two rows; mx comes from [sp, #12] and h
 * from [sp, #8]. Each row loads 8 source bytes, is filtered with
 * vp8_epel8_h4 (the same register is passed for both inputs), and its
 * first four bytes are appended to the buffer addressed by lr.
 * Second pass: my comes from [sp, #52+16+16] and h from [sp, #52+16+8].
 * Two groups of loads through lr fill d2-d3, d6, d28 and then d4-d5, d7,
 * d28[1], so that vp8_epel8_v6_y2 produces four output rows at once.
 * They are stored to r0 (stride r1) in the order d2[0], d3[0], d2[1],
 * d3[1].
 * NOTE(review): the re-pointing of lr at the intermediate buffer
 * (presumably a 52-byte stack area) and its adjustment between the two
 * load groups, register saves, loop control and the return are not
 * visible in this excerpt.
 */
function ff_put_vp8_epel4_h4v6_neon, export=1
1409
sub r2, r2, r3, lsl #1
1413
ldr r4, [sp, #12] @ mx
1414
movrel lr, subpel_filters-16
1415
ldr r12, [sp, #8] @ h
1416
add r4, lr, r4, lsl #4
1418
vld1.16 {q0}, [r4,:128]
1423
vld1.8 {d2}, [r2], r3
1424
vp8_epel8_h4 d2, d2, d2
1425
vst1.32 {d2[0]}, [lr,:32]!
1429
ldr r4, [sp, #52+16+16] @ my
1430
movrel lr, subpel_filters-16
1431
ldr r12, [sp, #52+16+8] @ h
1432
add r4, lr, r4, lsl #4
1434
vld1.16 {q0}, [r4,:128]
1437
vld1.8 {d2-d3}, [lr,:128]!
1438
vld1.8 {d6}, [lr,:64]!
1439
vld1.32 {d28[]}, [lr,:32]
1441
vld1.8 {d4-d5}, [lr]!
1442
vld1.8 {d7}, [lr,:64]!
1443
vld1.32 {d28[1]}, [lr,:32]
1447
vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28
1448
vst1.32 {d2[0]}, [r0,:32], r1
1449
vst1.32 {d3[0]}, [r0,:32], r1
1450
vst1.32 {d2[1]}, [r0,:32], r1
1451
vst1.32 {d3[1]}, [r0,:32], r1
1459
/*
 * ff_put_vp8_epel4_h6v4_neon (exported): 4-wide 6-tap horizontal filter
 * followed by a 4-tap vertical filter, run as two passes.
 *
 * First pass: mx comes from [sp, #12] and h from [sp, #8]. Each row loads
 * 16 source bytes, is filtered with vp8_epel8_h6, and its first four
 * bytes are appended to the buffer addressed by lr.
 * Second pass: my comes from [sp, #44+16+16] and h from [sp, #44+16+8].
 * Two groups of loads through lr fill d2-d3, d6 and then d4-d5, d6[1], so
 * that vp8_epel8_v4_y2 produces four output rows at once. They are stored
 * to r0 (stride r1) in the order d2[0], d3[0], d2[1], d3[1].
 * NOTE(review): the initial source-pointer adjustment, the re-pointing of
 * lr at the intermediate buffer (presumably a 44-byte stack area) and its
 * adjustment between the two load groups, register saves, loop control
 * and the return are not visible in this excerpt.
 */
function ff_put_vp8_epel4_h6v4_neon, export=1
1464
ldr r4, [sp, #12] @ mx
1465
movrel lr, subpel_filters-16
1466
ldr r12, [sp, #8] @ h
1467
add r4, lr, r4, lsl #4
1469
vld1.16 {q0}, [r4,:128]
1474
vld1.8 {q1}, [r2], r3
1475
vp8_epel8_h6 d2, d2, d3
1476
vst1.32 {d2[0]}, [lr,:32]!
1480
ldr r4, [sp, #44+16+16] @ my
1481
movrel lr, subpel_filters-16
1482
ldr r12, [sp, #44+16+8] @ h
1483
add r4, lr, r4, lsl #4
1485
vld1.16 {q0}, [r4,:128]
1488
vld1.8 {d2-d3}, [lr,:128]!
1489
vld1.32 {d6[]}, [lr,:32]
1491
vld1.8 {d4-d5}, [lr]!
1492
vld1.32 {d6[1]}, [lr,:32]
1495
vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6
1496
vst1.32 {d2[0]}, [r0,:32], r1
1497
vst1.32 {d3[0]}, [r0,:32], r1
1498
vst1.32 {d2[1]}, [r0,:32], r1
1499
vst1.32 {d3[1]}, [r0,:32], r1
1507
/*
 * ff_put_vp8_epel4_h4_neon (exported): 4-wide horizontal 4-tap filter.
 *
 * Visible behaviour: mx is read from [sp, #12] and h from [sp, #8]; the
 * filter row at subpel_filters - 16 + mx*16 is loaded into q0. Each
 * visible iteration loads 8 source bytes (d2) from r2 (stride r3),
 * applies vp8_epel8_h4 with d2 passed for both inputs, and stores only
 * the first four result bytes (d2[0]) to r0 (stride r1).
 * NOTE(review): the initial source-pointer adjustment, the register
 * saves, the loop on r12 (h) and the return are not visible in this
 * excerpt.
 */
function ff_put_vp8_epel4_h4_neon, export=1
1511
ldr r4, [sp, #12] @ mx
1512
movrel lr, subpel_filters-16
1513
ldr r12, [sp, #8] @ h
1514
add r4, lr, r4, lsl #4
1515
vld1.16 {q0}, [r4,:128]
1517
vld1.8 {d2}, [r2], r3
1518
vp8_epel8_h4 d2, d2, d2
1519
vst1.32 {d2[0]}, [r0,:32], r1
1526
/*
 * ff_put_vp8_epel4_v4_neon (exported): 4-wide vertical 4-tap filter that
 * handles four output rows per visible iteration by packing two 4-byte
 * rows into each d register.
 *
 * Visible behaviour: my is read from [sp, #16] and h from [sp, #8]; the
 * filter row at subpel_filters - 16 + my*16 is loaded into q0. Five rows
 * are loaded (replicated) into d2-d6 from r2 (stride r3), r2 is moved
 * back two rows, and the five rows starting two rows further down replace
 * lane 1 of the same registers. r2 is moved back two rows again, a net
 * advance of four. vp8_epel8_v4_y2 then yields rows 0 and 2 in d2 and
 * rows 1 and 3 in d3, stored to r0 (stride r1) in the order d2[0], d3[0],
 * d2[1], d3[1].
 * NOTE(review): the initial source-pointer adjustment, register saves,
 * the loop on r12 (h) and the return are not visible in this excerpt.
 */
function ff_put_vp8_epel4_v4_neon, export=1
1530
ldr r4, [sp, #16] @ my
1531
movrel lr, subpel_filters-16
1532
ldr r12, [sp, #8] @ h
1533
add r4, lr, r4, lsl #4
1534
vld1.16 {q0}, [r4,:128]
1536
vld1.32 {d2[]}, [r2], r3
1537
vld1.32 {d3[]}, [r2], r3
1538
vld1.32 {d4[]}, [r2], r3
1539
vld1.32 {d5[]}, [r2], r3
1540
vld1.32 {d6[]}, [r2]
1541
sub r2, r2, r3, lsl #1
1542
vld1.32 {d2[1]}, [r2], r3
1543
vld1.32 {d3[1]}, [r2], r3
1544
vld1.32 {d4[1]}, [r2], r3
1545
vld1.32 {d5[1]}, [r2], r3
1546
vld1.32 {d6[1]}, [r2]
1547
sub r2, r2, r3, lsl #1
1549
vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
1551
vst1.32 {d2[0]}, [r0,:32], r1
1552
vst1.32 {d3[0]}, [r0,:32], r1
1553
vst1.32 {d2[1]}, [r0,:32], r1
1554
vst1.32 {d3[1]}, [r0,:32], r1
1561
/*
 * ff_put_vp8_epel4_h4v4_neon (exported): 4-wide 4-tap horizontal filter
 * followed by a 4-tap vertical filter, run as two passes.
 *
 * First pass: mx comes from [sp, #12] and h from [sp, #8]. Each row loads
 * 8 source bytes into d2, is filtered with vp8_epel8_h4, and its first
 * four bytes are appended to the buffer addressed by lr.
 * Second pass: my comes from [sp, #44+16+16] and h from [sp, #44+16+8].
 * Two groups of loads through lr fill d2-d3, d6 and then d4-d5, d6[1], so
 * that vp8_epel8_v4_y2 produces four output rows at once. They are stored
 * to r0 (stride r1) in the order d2[0], d3[0], d2[1], d3[1].
 * NOTE(review): the first pass gives d3 as the macro's second input, but
 * no visible line loads d3 before that call (ff_put_vp8_epel4_h4_neon
 * passes d2 twice). Only the first four output bytes are kept, so this is
 * presumably harmless -- confirm. The initial source-pointer adjustment,
 * the re-pointing of lr at the intermediate buffer, register saves, loop
 * control and the return are not visible in this excerpt.
 */
function ff_put_vp8_epel4_h4v4_neon, export=1
1566
ldr r4, [sp, #12] @ mx
1567
movrel lr, subpel_filters-16
1568
ldr r12, [sp, #8] @ h
1569
add r4, lr, r4, lsl #4
1571
vld1.16 {q0}, [r4,:128]
1576
vld1.8 {d2}, [r2], r3
1577
vp8_epel8_h4 d2, d2, d3
1578
vst1.32 {d2[0]}, [lr,:32]!
1582
ldr r4, [sp, #44+16+16] @ my
1583
movrel lr, subpel_filters-16
1584
ldr r12, [sp, #44+16+8] @ h
1585
add r4, lr, r4, lsl #4
1587
vld1.16 {q0}, [r4,:128]
1590
vld1.8 {d2-d3}, [lr,:128]!
1591
vld1.32 {d6[]}, [lr,:32]
1593
vld1.8 {d4-d5}, [lr]!
1594
vld1.32 {d6[1]}, [lr,:32]
1597
vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6
1598
vst1.32 {d2[0]}, [r0,:32], r1
1599
vst1.32 {d3[0]}, [r0,:32], r1
1600
vst1.32 {d2[1]}, [r0,:32], r1
1601
vst1.32 {d3[1]}, [r0,:32], r1
1609
@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
1610
@ arithmetic can be used to apply filters
1611
/*
 * subpel_filters: seven rows of eight halfwords (16 bytes per row). Each
 * row holds six filter taps followed by two zero pad entries, so a row
 * can be fetched with a single 128-bit load.
 *
 * The functions in this file index it as subpel_filters - 16 + (m << 4),
 * so m = 1 selects the first row and m = 7 the last.
 * The values are stored as magnitudes: the filter macros subtract the
 * products of the second and fifth entries (d0[1], d1[0]) and add the
 * others.
 */
const subpel_filters, align=4
1612
.short 0, 6, 123, 12, 1, 0, 0, 0
1613
.short 2, 11, 108, 36, 8, 1, 0, 0
1614
.short 0, 9, 93, 50, 6, 0, 0, 0
1615
.short 3, 16, 77, 77, 16, 3, 0, 0
1616
.short 0, 6, 50, 93, 9, 0, 0, 0
1617
.short 1, 8, 36, 108, 11, 2, 0, 0
1618
.short 0, 1, 12, 123, 6, 0, 0, 0
1623
/*
 * ff_put_vp8_bilin16_h_neon (exported): 16-wide horizontal bilinear
 * filter, two rows per visible iteration.
 *
 * Visible behaviour: mx is read from [sp, #4] into r3. Two rows of 24
 * bytes are loaded from r2; r1 is the stride for both the source loads
 * and the destination stores. vext builds a copy of each row shifted by
 * one byte. For the second row the products pixel*d1 + next_pixel*d0 are
 * accumulated into q11/q12. Four 16-bit accumulators (q8, q3, q11, q12)
 * are narrowed with a rounding shift right by 3, and two 16-byte rows are
 * stored to r0.
 * NOTE(review): the setup of the weights in d0/d1 (presumably mx and
 * 8 - mx), the multiplies for the first row, the loop counter and the
 * return are not visible in this excerpt.
 */
function ff_put_vp8_bilin16_h_neon, export=1
1624
ldr r3, [sp, #4] @ mx
1631
vld1.8 {d2-d4}, [r2], r1
1632
vext.8 q2, q1, q2, #1
1635
vld1.8 {d18-d20},[r2], r1
1638
vext.8 q10, q9, q10, #1
1639
vmull.u8 q11, d18, d1
1640
vmlal.u8 q11, d20, d0
1641
vmull.u8 q12, d19, d1
1642
vmlal.u8 q12, d21, d0
1643
vrshrn.u16 d4, q8, #3
1644
vrshrn.u16 d5, q3, #3
1645
vrshrn.u16 d6, q11, #3
1646
vrshrn.u16 d7, q12, #3
1647
vst1.8 {q2}, [r0,:128], r1
1648
vst1.8 {q3}, [r0,:128], r1
1654
/*
 * ff_put_vp8_bilin16_v_neon (exported): 16-wide vertical bilinear filter,
 * two rows per visible iteration.
 *
 * Visible behaviour: my is read from [sp, #8] into r3. 16-byte rows are
 * loaded from r2 alternately into q1, q2 and q1 again; r1 is the stride
 * for both the source loads and the destination stores. The one visible
 * product pair computes q10 = d5*d1 + d3*d0. Four 16-bit accumulators
 * (q3, q8, q9, q10) are narrowed with a rounding shift right by 3, and
 * two 16-byte rows are stored to r0.
 * NOTE(review): the setup of the weights in d0/d1 (presumably my and
 * 8 - my), the other multiplies, the loop structure (the first row load
 * presumably sits before the loop) and the return are not visible in this
 * excerpt.
 */
function ff_put_vp8_bilin16_v_neon, export=1
1655
ldr r3, [sp, #8] @ my
1660
vld1.8 {q1}, [r2], r1
1663
vld1.8 {q2}, [r2], r1
1668
vld1.8 {q1}, [r2], r1
1671
vmull.u8 q10, d5, d1
1672
vmlal.u8 q10, d3, d0
1673
vrshrn.u16 d4, q3, #3
1674
vrshrn.u16 d5, q8, #3
1675
vrshrn.u16 d6, q9, #3
1676
vrshrn.u16 d7, q10, #3
1677
vst1.8 {q2}, [r0,:128], r1
1678
vst1.8 {q3}, [r0,:128], r1
1684
/*
 * ff_put_vp8_bilin16_hv_neon (exported): 16-wide bilinear filter in both
 * directions, two output rows per visible iteration.
 *
 * Visible behaviour: r3 receives mx from [sp, #4] and afterwards my from
 * [sp, #8], so mx has to be consumed in between. Rows of 24 bytes are
 * loaded from r2; r1 is the stride for both the source loads and the
 * destination stores. Each row is filtered horizontally as
 * pixel*d1 + next_pixel*d0 and narrowed with a rounding shift right by 3.
 * Adjacent horizontally filtered rows are then combined as
 * upper*d3 + lower*d2, narrowed the same way, and stored to r0.
 * NOTE(review): the setup of the weights d0-d3 (presumably mx, 8 - mx,
 * my, 8 - my), the horizontal multiplies for the first row, the loop
 * structure and the return are not visible in this excerpt.
 */
function ff_put_vp8_bilin16_hv_neon, export=1
1685
ldr r3, [sp, #4] @ mx
1689
ldr r3, [sp, #8] @ my
1695
vld1.8 {d4-d6}, [r2], r1
1696
vext.8 q3, q2, q3, #1
1701
vrshrn.u16 d4, q8, #3
1702
vrshrn.u16 d5, q9, #3
1705
vld1.8 {d18-d20},[r2], r1
1706
vext.8 q10, q9, q10, #1
1707
vmull.u8 q11, d18, d1
1708
vmlal.u8 q11, d20, d0
1709
vld1.8 {d26-d28},[r2], r1
1710
vmull.u8 q12, d19, d1
1711
vmlal.u8 q12, d21, d0
1712
vext.8 q14, q13, q14, #1
1713
vmull.u8 q8, d26, d1
1714
vmlal.u8 q8, d28, d0
1715
vmull.u8 q9, d27, d1
1716
vmlal.u8 q9, d29, d0
1717
vrshrn.u16 d6, q11, #3
1718
vrshrn.u16 d7, q12, #3
1719
vmull.u8 q12, d4, d3
1720
vmlal.u8 q12, d6, d2
1721
vmull.u8 q15, d5, d3
1722
vmlal.u8 q15, d7, d2
1723
vrshrn.u16 d4, q8, #3
1724
vrshrn.u16 d5, q9, #3
1725
vmull.u8 q10, d6, d3
1726
vmlal.u8 q10, d4, d2
1727
vmull.u8 q11, d7, d3
1728
vmlal.u8 q11, d5, d2
1729
vrshrn.u16 d24, q12, #3
1730
vrshrn.u16 d25, q15, #3
1731
vst1.8 {q12}, [r0,:128], r1
1732
vrshrn.u16 d20, q10, #3
1733
vrshrn.u16 d21, q11, #3
1734
vst1.8 {q10}, [r0,:128], r1
1740
/*
 * ff_put_vp8_bilin8_h_neon (exported): 8-wide horizontal bilinear filter,
 * two rows per visible iteration.
 *
 * Visible behaviour: mx is read from [sp, #4] into r3. Two rows of 16
 * bytes are loaded from r2 into q1 and q3; r1 is the stride for both the
 * source loads and the destination stores. vext replaces the high half of
 * each with the row shifted by one byte. The 16-bit accumulators q2 and
 * q8 are narrowed with a rounding shift right by 3 and stored as two
 * 8-byte rows to r0.
 * NOTE(review): the setup of the weights, the multiplies that fill q2 and
 * q8, the loop counter and the return are not visible in this excerpt.
 */
function ff_put_vp8_bilin8_h_neon, export=1
1741
ldr r3, [sp, #4] @ mx
1748
vld1.8 {q1}, [r2], r1
1749
vext.8 d3, d2, d3, #1
1752
vld1.8 {q3}, [r2], r1
1753
vext.8 d7, d6, d7, #1
1756
vrshrn.u16 d4, q2, #3
1757
vrshrn.u16 d16, q8, #3
1758
vst1.8 {d4}, [r0,:64], r1
1759
vst1.8 {d16}, [r0,:64], r1
1765
/*
 * ff_put_vp8_bilin8_v_neon (exported): 8-wide vertical bilinear filter,
 * two rows per visible iteration.
 *
 * Visible behaviour: my is read from [sp, #8] into r3. 8-byte rows are
 * loaded from r2 alternately into d2, d3 and d2 again; r1 is the stride
 * for both the source loads and the destination stores. The 16-bit
 * accumulators q2 and q3 are narrowed with a rounding shift right by 3
 * and stored as two 8-byte rows to r0.
 * NOTE(review): the setup of the weights, the multiplies that fill q2 and
 * q3, the loop structure (the first row load presumably sits before the
 * loop) and the return are not visible in this excerpt.
 */
function ff_put_vp8_bilin8_v_neon, export=1
1766
ldr r3, [sp, #8] @ my
1771
vld1.8 {d2}, [r2], r1
1774
vld1.8 {d3}, [r2], r1
1777
vld1.8 {d2}, [r2], r1
1780
vrshrn.u16 d4, q2, #3
1781
vrshrn.u16 d6, q3, #3
1782
vst1.8 {d4}, [r0,:64], r1
1783
vst1.8 {d6}, [r0,:64], r1
1789
/*
 * ff_put_vp8_bilin8_hv_neon (exported): 8-wide bilinear filter in both
 * directions, two output rows per visible iteration.
 *
 * Visible behaviour: r3 receives mx from [sp, #4] and afterwards my from
 * [sp, #8], so mx has to be consumed in between. Rows of 16 bytes are
 * loaded from r2; r1 is the stride for both the source loads and the
 * destination stores. vext places the one-byte-shifted row in the high
 * half of each. Horizontally filtered rows (d22, d16, then d22 again)
 * are narrowed with a rounding shift right by 3, combined vertically as
 * upper*d3 + lower*d2, narrowed again, and stored as 8-byte rows to r0.
 * NOTE(review): the setup of the weights, the horizontal multiplies that
 * fill q8 and q9, the loop structure and the return are not visible in
 * this excerpt.
 */
function ff_put_vp8_bilin8_hv_neon, export=1
1790
ldr r3, [sp, #4] @ mx
1794
ldr r3, [sp, #8] @ my
1800
vld1.8 {q2}, [r2], r1
1801
vext.8 d5, d4, d5, #1
1804
vrshrn.u16 d22, q9, #3
1807
vld1.8 {q3}, [r2], r1
1808
vext.8 d7, d6, d7, #1
1811
vld1.8 {q2}, [r2], r1
1812
vext.8 d5, d4, d5, #1
1815
vrshrn.u16 d16, q8, #3
1816
vmull.u8 q10, d22, d3
1817
vmlal.u8 q10, d16, d2
1818
vrshrn.u16 d22, q9, #3
1819
vmull.u8 q12, d16, d3
1820
vmlal.u8 q12, d22, d2
1821
vrshrn.u16 d20, q10, #3
1822
vst1.8 {d20}, [r0,:64], r1
1823
vrshrn.u16 d23, q12, #3
1824
vst1.8 {d23}, [r0,:64], r1
1830
/*
 * ff_put_vp8_bilin4_h_neon (exported): 4-wide horizontal bilinear filter,
 * two rows per visible iteration.
 *
 * Visible behaviour: mx is read from [sp, #4] into r3. Two rows of 8
 * bytes are loaded from r2 into d2 and d6; r1 is the stride for both the
 * source loads and the destination stores. vext writes one-byte-shifted
 * copies to d3 and d7. The 16-bit accumulator q2 is narrowed with a
 * rounding shift right by 3 into d4, whose two 32-bit lanes are stored as
 * the two 4-byte output rows to r0.
 * NOTE(review): the vext instructions read d3 and d7 as their second
 * input, and no visible line loads those registers; since only 4 output
 * pixels per row are kept this is presumably harmless -- confirm. The
 * setup of the weights, the multiplies that fill q2, the loop counter and
 * the return are not visible in this excerpt.
 */
function ff_put_vp8_bilin4_h_neon, export=1
1831
ldr r3, [sp, #4] @ mx
1838
vld1.8 {d2}, [r2], r1
1839
vext.8 d3, d2, d3, #1
1840
vld1.8 {d6}, [r2], r1
1841
vext.8 d7, d6, d7, #1
1845
vrshrn.u16 d4, q2, #3
1846
vst1.32 {d4[0]}, [r0,:32], r1
1847
vst1.32 {d4[1]}, [r0,:32], r1
1853
/*
 * ff_put_vp8_bilin4_v_neon (exported): 4-wide vertical bilinear filter,
 * two rows per visible iteration.
 *
 * Visible behaviour: my is read from [sp, #8] into r3. One 4-byte row is
 * loaded (replicated) into d2 and r2 advances by r1. The next row is
 * loaded into d3 without advancing and then again into d2[1] with an
 * advance, and the row after that goes into d3[1]. d2 therefore holds
 * rows n and n+1, and d3 holds rows n+1 and n+2. The 16-bit accumulator
 * q2 is narrowed with a rounding shift right by 3 into d4, whose two
 * 32-bit lanes are stored as two 4-byte rows to r0 (stride r1).
 * NOTE(review): the setup of the weights, the multiplies that fill q2,
 * the loop structure and the return are not visible in this excerpt.
 */
function ff_put_vp8_bilin4_v_neon, export=1
1854
ldr r3, [sp, #8] @ my
1859
vld1.32 {d2[]}, [r2], r1
1861
vld1.32 {d3[]}, [r2]
1862
vld1.32 {d2[1]}, [r2], r1
1863
vld1.32 {d3[1]}, [r2], r1
1867
vrshrn.u16 d4, q2, #3
1868
vst1.32 {d4[0]}, [r0,:32], r1
1869
vst1.32 {d4[1]}, [r0,:32], r1
1876
/*
 * ff_put_vp8_bilin4_hv_neon (exported): 4-wide bilinear filter in both
 * directions, two output rows per visible iteration.
 *
 * Visible behaviour: r3 receives mx from [sp, #4] and afterwards my from
 * [sp, #8], so mx has to be consumed in between. Rows of 8 bytes are
 * loaded from r2; r1 is the stride for both the source loads and the
 * destination stores. vext builds a copy of each row rotated by one byte.
 * Horizontally filtered data is narrowed into d22 and d16 with a rounding
 * shift right by 3, combined as d16*d2 + d22*d3, narrowed again into d20,
 * and its two 32-bit lanes are stored as two 4-byte rows to r0.
 * NOTE(review): the setup of the weights, the horizontal multiplies, the
 * lane shuffle that presumably pairs rows between d22 and d16, the loop
 * structure and the return are not visible in this excerpt.
 */
function ff_put_vp8_bilin4_hv_neon, export=1
1877
ldr r3, [sp, #4] @ mx
1881
ldr r3, [sp, #8] @ my
1887
vld1.8 {d4}, [r2], r1
1888
vext.8 d5, d4, d4, #1
1891
vrshrn.u16 d22, q9, #3
1894
vld1.8 {d6}, [r2], r1
1895
vext.8 d7, d6, d6, #1
1896
vld1.8 {d4}, [r2], r1
1897
vext.8 d5, d4, d4, #1
1901
vrshrn.u16 d16, q8, #3
1902
vmull.u8 q10, d16, d2
1904
vmlal.u8 q10, d22, d3
1906
vrshrn.u16 d20, q10, #3
1907
vst1.32 {d20[0]}, [r0,:32], r1
1908
vst1.32 {d20[1]}, [r0,:32], r1