2
* ARM NEON optimised DSP functions
3
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
5
* This file is part of FFmpeg.
7
* FFmpeg is free software; you can redistribute it and/or
8
* modify it under the terms of the GNU Lesser General Public
9
* License as published by the Free Software Foundation; either
10
* version 2.1 of the License, or (at your option) any later version.
12
* FFmpeg is distributed in the hope that it will be useful,
13
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
* Lesser General Public License for more details.
17
* You should have received a copy of the GNU Lesser General Public
18
* License along with FFmpeg; if not, write to the Free Software
19
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
32
1: vld1.64 {d0, d1}, [r1], r2
33
vld1.64 {d2, d3}, [r1], r2
34
vld1.64 {d4, d5}, [r1], r2
36
vld1.64 {d6, d7}, [r1], r2
41
vld1.64 {d16,d17}, [ip,:128], r2
43
vld1.64 {d18,d19}, [ip,:128], r2
45
vld1.64 {d20,d21}, [ip,:128], r2
47
vld1.64 {d22,d23}, [ip,:128], r2
51
vst1.64 {d0, d1}, [r0,:128], r2
52
vst1.64 {d2, d3}, [r0,:128], r2
53
vst1.64 {d4, d5}, [r0,:128], r2
54
vst1.64 {d6, d7}, [r0,:128], r2
59
/*
 * pixels16_x2: macro parameterised by the halving-add mnemonic \vhadd
 * (default vrhadd.u8, the rounding form).
 * Visible steps: two rows of 24 bytes (three d registers each) are read
 * from r1, and two 16-byte rows are written to r0 with a 128-bit
 * alignment hint; both pointers are post-incremented by r2.
 * NOTE(review): the bare numeric lines are not instructions (they look
 * like line-number residue); the instructions between the loads and the
 * stores, and the closing .endm, are not visible here.
 */
.macro pixels16_x2 vhadd=vrhadd.u8
60
1: vld1.64 {d0-d2}, [r1], r2
61
vld1.64 {d4-d6}, [r1], r2
69
vst1.64 {d0, d1}, [r0,:128], r2
70
vst1.64 {d4, d5}, [r0,:128], r2
75
/*
 * pixels16_y2: macro parameterised by the halving-add mnemonic \vhadd
 * (default vrhadd.u8, the rounding form).
 * Visible steps: 16-byte rows are read from r1 alternately into d0,d1
 * and d2,d3; 16-byte rows held in d4,d5 and d6,d7 are written to r0 with
 * a 128-bit alignment hint. r2 is the post-increment for both pointers.
 * NOTE(review): the instructions that produce d4-d7, the loop control
 * and .endm are not visible here; the bare numeric lines are not
 * instructions.
 */
.macro pixels16_y2 vhadd=vrhadd.u8
76
vld1.64 {d0, d1}, [r1], r2
77
vld1.64 {d2, d3}, [r1], r2
80
vld1.64 {d0, d1}, [r1], r2
82
vld1.64 {d2, d3}, [r1], r2
85
vst1.64 {d4, d5}, [r0,:128], r2
86
vst1.64 {d6, d7}, [r0,:128], r2
91
/*
 * pixels16_xy2: macro with two parameters, \vshrn (default vrshrn.u16,
 * the rounding narrowing shift) and \no_rnd (default 0).
 * Visible steps: 24-byte rows are read from r1 (post-increment r2);
 * vext.8 ..., #1 builds a copy of a row shifted by one byte; vaddl.u8
 * widens byte sums to 16 bits and vadd.u16 combines 16-bit sums; 16-byte
 * results in d28,d29 and d30,d31 are written to r0 (128-bit alignment
 * hint, post-increment r2).
 * NOTE(review): most of the arithmetic, every use of \vshrn and
 * \no_rnd, the loop control and .endm are not visible here; the bare
 * numeric lines are not instructions.
 */
.macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
92
vld1.64 {d0-d2}, [r1], r2
93
vld1.64 {d4-d6}, [r1], r2
100
/* q3 = bytes 1..16 of the 24-byte row held in q2:d6 (row shifted by one byte) */
vext.8 q3, q2, q3, #1
106
vld1.64 {d0-d2}, [r1], r2
110
vadd.u16 q12, q12, q13
112
vext.8 q15, q0, q1, #1
113
vadd.u16 q1 , q10, q11
120
vld1.64 {d2-d4}, [r1], r2
121
/* widening add of d1 (upper half of q0) and d31 (upper half of the shifted copy q15) */
vaddl.u8 q10, d1, d31
122
vst1.64 {d28,d29}, [r0,:128], r2
126
vadd.u16 q12, q12, q13
128
vext.8 q2, q1, q2, #1
129
vadd.u16 q0, q10, q11
137
vst1.64 {d30,d31}, [r0,:128], r2
143
1: vld1.64 {d0}, [r1], r2
144
vld1.64 {d1}, [r1], r2
145
vld1.64 {d2}, [r1], r2
147
vld1.64 {d3}, [r1], r2
152
vld1.64 {d4}, [r0,:64], r2
154
vld1.64 {d5}, [r0,:64], r2
156
vld1.64 {d6}, [r0,:64], r2
158
vld1.64 {d7}, [r0,:64], r2
160
sub r0, r0, r2, lsl #2
163
vst1.64 {d0}, [r0,:64], r2
164
vst1.64 {d1}, [r0,:64], r2
165
vst1.64 {d2}, [r0,:64], r2
166
vst1.64 {d3}, [r0,:64], r2
171
/*
 * pixels8_x2: macro parameterised by the halving-add mnemonic \vhadd
 * (default vrhadd.u8, the rounding form).
 * Visible steps: two 16-byte rows are read from r1 (post-increment r2);
 * for each row vext.8 ..., #1 replaces the upper d register with bytes
 * 1..8 of the row, i.e. the row shifted by one byte; two 8-byte rows
 * (d0, d1) are written to r0 with a 64-bit alignment hint
 * (post-increment r2).
 * NOTE(review): the use of \vhadd, the loop control and .endm are not
 * visible here; the bare numeric lines are not instructions.
 */
.macro pixels8_x2 vhadd=vrhadd.u8
172
1: vld1.64 {d0, d1}, [r1], r2
173
vext.8 d1, d0, d1, #1
174
vld1.64 {d2, d3}, [r1], r2
175
vext.8 d3, d2, d3, #1
181
vst1.64 {d0}, [r0,:64], r2
182
vst1.64 {d1}, [r0,:64], r2
187
/*
 * pixels8_y2: macro parameterised by the halving-add mnemonic \vhadd
 * (default vrhadd.u8, the rounding form).
 * Visible steps: 8-byte rows are read from r1 alternately into d0 and
 * d1; 8-byte rows held in d4 and d5 are written to r0 with a 64-bit
 * alignment hint. r2 is the post-increment for both pointers.
 * NOTE(review): the instructions that produce d4/d5, the loop control
 * and .endm are not visible here; the bare numeric lines are not
 * instructions.
 */
.macro pixels8_y2 vhadd=vrhadd.u8
188
vld1.64 {d0}, [r1], r2
189
vld1.64 {d1}, [r1], r2
192
vld1.64 {d0}, [r1], r2
194
vld1.64 {d1}, [r1], r2
197
vst1.64 {d4}, [r0,:64], r2
198
vst1.64 {d5}, [r0,:64], r2
203
/*
 * pixels8_xy2: macro with two parameters, \vshrn (default vrshrn.u16,
 * the rounding narrowing shift) and \no_rnd (default 0).
 * Visible steps: 16-byte rows are read from r1 alternately into d0,d1
 * and d2,d3 (post-increment r2); vext.8 ..., #1 extracts bytes 1..8 of
 * a row into d4 or d6; vadd.u16 accumulates 16-bit sums in q10; 8-byte
 * results in d5 and d7 are written to r0 (64-bit alignment hint,
 * post-increment r2).
 * NOTE(review): the widening adds, every use of \vshrn and \no_rnd, the
 * loop control and .endm are not visible here; the bare numeric lines
 * are not instructions.
 */
.macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
204
vld1.64 {d0, d1}, [r1], r2
205
vld1.64 {d2, d3}, [r1], r2
211
vext.8 d4, d0, d1, #1
212
vext.8 d6, d2, d3, #1
216
vld1.64 {d0, d1}, [r1], r2
219
vext.8 d4, d0, d1, #1
221
vadd.u16 q10, q10, q11
225
vld1.64 {d2, d3}, [r1], r2
229
vadd.u16 q10, q10, q11
231
vst1.64 {d5}, [r0,:64], r2
233
vext.8 d6, d2, d3, #1
235
vst1.64 {d7}, [r0,:64], r2
240
/*
 * pixfunc: opens an exported function whose symbol is built by pasting
 * the arguments together: ff_<pfx><name><suf>_neon. The empty \() only
 * terminates the \suf argument name so that "_neon" can follow it.
 * NOTE(review): the use of \rnd_op and \args, the function body and
 * .endm are not visible here; "function" is defined outside this view.
 */
.macro pixfunc pfx name suf rnd_op args:vararg
241
function ff_\pfx\name\suf\()_neon, export=1
246
/*
 * pixfunc2: forwards \pfx, \name and all remaining arguments to pixfunc.
 * NOTE(review): only this one invocation is visible; any other lines of
 * the macro and its .endm are not visible here.
 */
.macro pixfunc2 pfx name args:vararg
248
pixfunc \pfx \name \args
251
/*
 * ff_put_h264_qpel16_mc00_neon: exported entry point.
 * NOTE(review): none of its instructions are visible here.
 *
 * The pixfunc/pixfunc2 lines below instantiate functions for the "put_"
 * prefix from the pixels16* macros. With the visible definitions, e.g.
 * "pixfunc2 put_ pixels16_x2, _no_rnd, vhadd.u8" reaches pixfunc with
 * suf=_no_rnd and rnd_op=vhadd.u8, which opens
 * ff_put_pixels16_x2_no_rnd_neon.
 */
function ff_put_h264_qpel16_mc00_neon, export=1
255
pixfunc put_ pixels16
256
pixfunc2 put_ pixels16_x2, _no_rnd, vhadd.u8
257
pixfunc2 put_ pixels16_y2, _no_rnd, vhadd.u8
258
pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1
260
/*
 * ff_avg_h264_qpel16_mc00_neon: exported entry point.
 * NOTE(review): none of its instructions are visible here.
 *
 * The pixfunc line instantiates the "avg_" pixels16 function with an
 * empty suffix (the double comma) and rnd_op=1, i.e. ff_avg_pixels16_neon.
 */
function ff_avg_h264_qpel16_mc00_neon, export=1
264
pixfunc avg_ pixels16,, 1
266
/*
 * ff_put_h264_qpel8_mc00_neon: exported entry point.
 * NOTE(review): none of its instructions are visible here.
 *
 * The pixfunc2 lines below instantiate functions for the "put_" prefix
 * from the pixels8_x2 / pixels8_y2 / pixels8_xy2 macros, passing the
 * suffix _no_rnd and the non-rounding mnemonics vhadd.u8 / vshrn.u16.
 */
function ff_put_h264_qpel8_mc00_neon, export=1
271
pixfunc2 put_ pixels8_x2, _no_rnd, vhadd.u8
272
pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8
273
pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1
275
/*
 * ff_avg_h264_qpel8_mc00_neon: exported entry point.
 * NOTE(review): none of its instructions are visible here.
 *
 * The pixfunc line instantiates the "avg_" pixels8 function with an
 * empty suffix (the double comma) and rnd_op=1, i.e. ff_avg_pixels8_neon.
 */
function ff_avg_h264_qpel8_mc00_neon, export=1
279
pixfunc avg_ pixels8,, 1
281
/*
 * ff_put_pixels_clamped_neon
 * Visible steps: 128 bytes are read from r0 in four 32-byte loads
 * (128-bit alignment hint, write-back) into d16-d31; eight 8-byte rows
 * held in d0-d7 are written to r1 (64-bit alignment hint), with r1
 * advanced by r2 after each row.
 * NOTE(review): the instructions that fill d0-d7 from d16-d31 and the
 * return are not visible here; the bare numeric lines are not
 * instructions.
 */
function ff_put_pixels_clamped_neon, export=1
282
vld1.64 {d16-d19}, [r0,:128]!
284
vld1.64 {d20-d23}, [r0,:128]!
286
vld1.64 {d24-d27}, [r0,:128]!
288
vld1.64 {d28-d31}, [r0,:128]!
290
vst1.64 {d0}, [r1,:64], r2
292
vst1.64 {d1}, [r1,:64], r2
294
vst1.64 {d2}, [r1,:64], r2
296
vst1.64 {d3}, [r1,:64], r2
298
vst1.64 {d4}, [r1,:64], r2
299
vst1.64 {d5}, [r1,:64], r2
300
vst1.64 {d6}, [r1,:64], r2
301
vst1.64 {d7}, [r1,:64], r2
305
/*
 * ff_put_signed_pixels_clamped_neon
 * Visible steps: 16-byte blocks are read from r0 (128-bit alignment
 * hint, write-back), reusing d16-d19 for the first four loads and then
 * d16-d27; eight 8-byte rows held in d0-d7 are written to r1 (64-bit
 * alignment hint), with r1 advanced by r2 after each row. Loads and
 * stores are interleaved.
 * NOTE(review): the instructions that turn the loaded data into d0-d7
 * and the return are not visible here; the bare numeric lines are not
 * instructions.
 */
function ff_put_signed_pixels_clamped_neon, export=1
307
vld1.64 {d16-d17}, [r0,:128]!
309
vld1.64 {d18-d19}, [r0,:128]!
311
vld1.64 {d16-d17}, [r0,:128]!
313
vld1.64 {d18-d19}, [r0,:128]!
315
vld1.64 {d20-d21}, [r0,:128]!
317
vld1.64 {d22-d23}, [r0,:128]!
319
vst1.64 {d0}, [r1,:64], r2
321
vst1.64 {d1}, [r1,:64], r2
323
vst1.64 {d2}, [r1,:64], r2
325
vld1.64 {d24-d25}, [r0,:128]!
327
vld1.64 {d26-d27}, [r0,:128]!
330
vst1.64 {d3}, [r1,:64], r2
332
vst1.64 {d4}, [r1,:64], r2
334
vst1.64 {d5}, [r1,:64], r2
337
vst1.64 {d6}, [r1,:64], r2
338
vst1.64 {d7}, [r1,:64], r2
342
/*
 * ff_add_pixels_clamped_neon
 * Visible steps, interleaved: 8-byte rows are read from r1 into d16-d19
 * (64-bit alignment hint, post-increment r2); 16-byte blocks are read
 * from r0 into q0-q3 (128-bit alignment hint, write-back); 8-byte rows
 * held in d0, d2, d4 and d6 are written through r3 (64-bit alignment
 * hint, post-increment r2). Eight rows are read and eight are written.
 * NOTE(review): the setup of r3, the arithmetic between the loads and
 * the stores, and the return are not visible here; the bare numeric
 * lines are not instructions.
 */
function ff_add_pixels_clamped_neon, export=1
344
vld1.64 {d16}, [r1,:64], r2
345
vld1.64 {d0-d1}, [r0,:128]!
347
vld1.64 {d17}, [r1,:64], r2
348
vld1.64 {d2-d3}, [r0,:128]!
350
vld1.64 {d18}, [r1,:64], r2
352
vld1.64 {d4-d5}, [r0,:128]!
354
vst1.64 {d0}, [r3,:64], r2
356
vld1.64 {d19}, [r1,:64], r2
357
vld1.64 {d6-d7}, [r0,:128]!
360
vst1.64 {d2}, [r3,:64], r2
361
vld1.64 {d16}, [r1,:64], r2
363
vld1.64 {d0-d1}, [r0,:128]!
365
vst1.64 {d4}, [r3,:64], r2
366
vld1.64 {d17}, [r1,:64], r2
367
vld1.64 {d2-d3}, [r0,:128]!
369
vst1.64 {d6}, [r3,:64], r2
371
vld1.64 {d18}, [r1,:64], r2
372
vld1.64 {d4-d5}, [r0,:128]!
374
vst1.64 {d0}, [r3,:64], r2
376
vld1.64 {d19}, [r1,:64], r2
378
vld1.64 {d6-d7}, [r0,:128]!
380
vst1.64 {d2}, [r3,:64], r2
382
vst1.64 {d4}, [r3,:64], r2
383
vst1.64 {d6}, [r3,:64], r2
387
/*
 * ff_float_to_int16_neon
 * Visible data flow: four floats at a time are read from r1 (128-bit
 * alignment hint, write-back); vcvt.s32.f32 ..., #16 converts each to a
 * signed 32-bit fixed-point value with 16 fraction bits; vshrn.s32
 * ..., #16 narrows by keeping bits 31..16 of each lane, giving four
 * 16-bit results per d register; eight 16-bit results at a time are
 * written to r0 (128-bit alignment hint, write-back).
 * Loads/conversions for the next group are interleaved with the
 * narrowing/stores of the previous one. Labels 2 and 3 start the
 * sequences that drain values already converted in q8/q9.
 * NOTE(review): the length handling, the branches to labels 2/3 and the
 * return are not visible here; the bare numeric lines are not
 * instructions.
 */
function ff_float_to_int16_neon, export=1
389
vld1.64 {d0-d1}, [r1,:128]!
390
vcvt.s32.f32 q8, q0, #16
391
vld1.64 {d2-d3}, [r1,:128]!
392
vcvt.s32.f32 q9, q1, #16
397
vshrn.s32 d4, q8, #16
398
vld1.64 {d0-d1}, [r1,:128]!
399
vcvt.s32.f32 q0, q0, #16
400
vshrn.s32 d5, q9, #16
401
vld1.64 {d2-d3}, [r1,:128]!
402
vcvt.s32.f32 q1, q1, #16
403
vshrn.s32 d6, q0, #16
404
vst1.64 {d4-d5}, [r0,:128]!
405
vshrn.s32 d7, q1, #16
406
vld1.64 {d16-d17},[r1,:128]!
407
vcvt.s32.f32 q8, q8, #16
408
vld1.64 {d18-d19},[r1,:128]!
409
vcvt.s32.f32 q9, q9, #16
410
vst1.64 {d6-d7}, [r0,:128]!
414
2: vld1.64 {d0-d1}, [r1,:128]!
415
vshrn.s32 d4, q8, #16
416
vcvt.s32.f32 q0, q0, #16
417
vld1.64 {d2-d3}, [r1,:128]!
418
vshrn.s32 d5, q9, #16
419
vcvt.s32.f32 q1, q1, #16
420
vshrn.s32 d6, q0, #16
421
vst1.64 {d4-d5}, [r0,:128]!
422
vshrn.s32 d7, q1, #16
423
vst1.64 {d6-d7}, [r0,:128]!
425
3: vshrn.s32 d4, q8, #16
426
vshrn.s32 d5, q9, #16
427
vst1.64 {d4-d5}, [r0,:128]!
431
/*
 * ff_float_to_int16_interleave_neon
 * Every visible path converts floats with vcvt.s32.f32 ..., #16 (signed
 * 32-bit fixed point, 16 fraction bits), so the wanted 16-bit result is
 * the upper half of each 32-bit lane. The paths differ in how those
 * upper halves are combined and stored:
 *   - two read pointers (r3, r1), merged with vsri.32 and written
 *     contiguously to r0;
 *   - four read pointers (r4-r7, fetched from the table at r1), written
 *     8 bytes at a time to r8 with stride ip;
 *   - two read pointers (r4, r5), merged with vsri.32 and written one
 *     32-bit lane at a time to r8 with stride ip;
 *   - one read pointer (r4), written one 16-bit lane at a time to r5
 *     with stride ip.
 * NOTE(review): the register setup, the comparisons and branches that
 * select a path, the loop control, several merge instructions and the
 * return are not visible here; the bare numeric lines are not
 * instructions. Numeric labels (2, 3, 5, 6, 7) are local and reused.
 */
function ff_float_to_int16_interleave_neon, export=1
434
/* taken when the preceding flag-setting instruction (not visible) gives "less than" */
blt ff_float_to_int16_neon
441
/* two-pointer path: r3 and r1 are the read pointers, r0 the write pointer */
vld1.64 {d0-d1}, [r3,:128]!
442
vcvt.s32.f32 q8, q0, #16
443
vld1.64 {d2-d3}, [r3,:128]!
444
vcvt.s32.f32 q9, q1, #16
445
vld1.64 {d20-d21},[r1,:128]!
446
vcvt.s32.f32 q10, q10, #16
447
vld1.64 {d22-d23},[r1,:128]!
448
vcvt.s32.f32 q11, q11, #16
453
vld1.64 {d0-d1}, [r3,:128]!
454
vcvt.s32.f32 q0, q0, #16
456
vld1.64 {d2-d3}, [r3,:128]!
457
vcvt.s32.f32 q1, q1, #16
458
vld1.64 {d24-d25},[r1,:128]!
459
vcvt.s32.f32 q12, q12, #16
460
vld1.64 {d26-d27},[r1,:128]!
462
vst1.64 {d20-d21},[r0,:128]!
463
vcvt.s32.f32 q13, q13, #16
464
vst1.64 {d22-d23},[r0,:128]!
466
vld1.64 {d16-d17},[r3,:128]!
468
vst1.64 {d24-d25},[r0,:128]!
469
vcvt.s32.f32 q8, q8, #16
470
vld1.64 {d18-d19},[r3,:128]!
471
vcvt.s32.f32 q9, q9, #16
472
vld1.64 {d20-d21},[r1,:128]!
473
vcvt.s32.f32 q10, q10, #16
474
vld1.64 {d22-d23},[r1,:128]!
475
vcvt.s32.f32 q11, q11, #16
476
vst1.64 {d26-d27},[r0,:128]!
480
/* vsri.32 #16: each lane keeps the upper 16 bits of q10 and takes the upper 16 bits of q8 as its lower half */
2: vsri.32 q10, q8, #16
481
vld1.64 {d0-d1}, [r3,:128]!
482
vcvt.s32.f32 q0, q0, #16
483
vld1.64 {d2-d3}, [r3,:128]!
484
vcvt.s32.f32 q1, q1, #16
485
vld1.64 {d24-d25},[r1,:128]!
486
vcvt.s32.f32 q12, q12, #16
488
vld1.64 {d26-d27},[r1,:128]!
489
vcvt.s32.f32 q13, q13, #16
490
vst1.64 {d20-d21},[r0,:128]!
492
vst1.64 {d22-d23},[r0,:128]!
494
vst1.64 {d24-d27},[r0,:128]!
496
3: vsri.32 q10, q8, #16
498
vst1.64 {d20-d23},[r0,:128]!
507
/* four-pointer path: fetch four read pointers from the table at r1 (write-back) */
5: ldmia r1!, {r4-r7}
510
vld1.64 {d16-d17},[r4,:128]!
511
vcvt.s32.f32 q8, q8, #16
512
vld1.64 {d18-d19},[r5,:128]!
513
vcvt.s32.f32 q9, q9, #16
514
vld1.64 {d20-d21},[r6,:128]!
515
vcvt.s32.f32 q10, q10, #16
516
vld1.64 {d22-d23},[r7,:128]!
517
vcvt.s32.f32 q11, q11, #16
519
vld1.64 {d0-d1}, [r4,:128]!
520
vcvt.s32.f32 q0, q0, #16
522
vld1.64 {d2-d3}, [r5,:128]!
523
vcvt.s32.f32 q1, q1, #16
524
vsri.32 q11, q10, #16
525
vld1.64 {d4-d5}, [r6,:128]!
526
vcvt.s32.f32 q2, q2, #16
528
vld1.64 {d6-d7}, [r7,:128]!
529
vcvt.s32.f32 q3, q3, #16
531
/* 8-byte stores through r8, advanced by ip after each store */
vst1.64 {d18}, [r8], ip
533
vst1.64 {d22}, [r8], ip
535
vst1.64 {d19}, [r8], ip
537
vst1.64 {d23}, [r8], ip
540
vld1.64 {d16-d17},[r4,:128]!
541
vcvt.s32.f32 q8, q8, #16
542
vst1.64 {d2}, [r8], ip
543
vld1.64 {d18-d19},[r5,:128]!
544
vcvt.s32.f32 q9, q9, #16
545
vst1.64 {d6}, [r8], ip
546
vld1.64 {d20-d21},[r6,:128]!
547
vcvt.s32.f32 q10, q10, #16
548
vst1.64 {d3}, [r8], ip
549
vld1.64 {d22-d23},[r7,:128]!
550
vcvt.s32.f32 q11, q11, #16
551
vst1.64 {d7}, [r8], ip
553
7: vst1.64 {d2}, [r8], ip
554
vst1.64 {d6}, [r8], ip
555
vst1.64 {d3}, [r8], ip
556
vst1.64 {d7}, [r8], ip
570
/* two-pointer strided path: r4 and r5 are the read pointers; results leave one 32-bit lane at a time */
vld1.64 {d16-d17},[r4,:128]!
571
vcvt.s32.f32 q8, q8, #16
572
vld1.64 {d18-d19},[r5,:128]!
573
vcvt.s32.f32 q9, q9, #16
574
vld1.64 {d20-d21},[r4,:128]!
575
vcvt.s32.f32 q10, q10, #16
576
vld1.64 {d22-d23},[r5,:128]!
577
vcvt.s32.f32 q11, q11, #16
581
vsri.32 d18, d16, #16
582
vsri.32 d19, d17, #16
583
vld1.64 {d16-d17},[r4,:128]!
584
vcvt.s32.f32 q8, q8, #16
585
vst1.32 {d18[0]}, [r8], ip
586
vsri.32 d22, d20, #16
587
vst1.32 {d18[1]}, [r8], ip
588
vsri.32 d23, d21, #16
589
vst1.32 {d19[0]}, [r8], ip
590
vst1.32 {d19[1]}, [r8], ip
591
vld1.64 {d18-d19},[r5,:128]!
592
vcvt.s32.f32 q9, q9, #16
593
vst1.32 {d22[0]}, [r8], ip
594
vst1.32 {d22[1]}, [r8], ip
595
vld1.64 {d20-d21},[r4,:128]!
596
vcvt.s32.f32 q10, q10, #16
597
vst1.32 {d23[0]}, [r8], ip
598
vst1.32 {d23[1]}, [r8], ip
599
vld1.64 {d22-d23},[r5,:128]!
600
vcvt.s32.f32 q11, q11, #16
602
vld1.64 {d0-d1}, [r4,:128]!
603
vcvt.s32.f32 q0, q0, #16
604
vsri.32 d18, d16, #16
605
vld1.64 {d2-d3}, [r5,:128]!
606
vcvt.s32.f32 q1, q1, #16
607
vsri.32 d19, d17, #16
608
vld1.64 {d4-d5}, [r4,:128]!
609
vcvt.s32.f32 q2, q2, #16
610
vld1.64 {d6-d7}, [r5,:128]!
611
vcvt.s32.f32 q3, q3, #16
612
vst1.32 {d18[0]}, [r8], ip
613
vsri.32 d22, d20, #16
614
vst1.32 {d18[1]}, [r8], ip
615
vsri.32 d23, d21, #16
616
vst1.32 {d19[0]}, [r8], ip
618
vst1.32 {d19[1]}, [r8], ip
620
vst1.32 {d22[0]}, [r8], ip
622
vst1.32 {d22[1]}, [r8], ip
624
vst1.32 {d23[0]}, [r8], ip
625
vst1.32 {d23[1]}, [r8], ip
627
vld1.64 {d16-d17},[r4,:128]!
628
vcvt.s32.f32 q8, q8, #16
629
vst1.32 {d2[0]}, [r8], ip
630
vst1.32 {d2[1]}, [r8], ip
631
vld1.64 {d18-d19},[r5,:128]!
632
vcvt.s32.f32 q9, q9, #16
633
vst1.32 {d3[0]}, [r8], ip
634
vst1.32 {d3[1]}, [r8], ip
635
vld1.64 {d20-d21},[r4,:128]!
636
vcvt.s32.f32 q10, q10, #16
637
vst1.32 {d6[0]}, [r8], ip
638
vst1.32 {d6[1]}, [r8], ip
639
vld1.64 {d22-d23},[r5,:128]!
640
vcvt.s32.f32 q11, q11, #16
641
vst1.32 {d7[0]}, [r8], ip
642
vst1.32 {d7[1]}, [r8], ip
644
6: vst1.32 {d2[0]}, [r8], ip
645
vst1.32 {d2[1]}, [r8], ip
646
vst1.32 {d3[0]}, [r8], ip
647
vst1.32 {d3[1]}, [r8], ip
648
vst1.32 {d6[0]}, [r8], ip
649
vst1.32 {d6[1]}, [r8], ip
650
vst1.32 {d7[0]}, [r8], ip
651
vst1.32 {d7[1]}, [r8], ip
653
7: vsri.32 d18, d16, #16
654
vsri.32 d19, d17, #16
655
vst1.32 {d18[0]}, [r8], ip
656
vsri.32 d22, d20, #16
657
vst1.32 {d18[1]}, [r8], ip
658
vsri.32 d23, d21, #16
659
vst1.32 {d19[0]}, [r8], ip
660
vst1.32 {d19[1]}, [r8], ip
661
vst1.32 {d22[0]}, [r8], ip
662
vst1.32 {d22[1]}, [r8], ip
663
vst1.32 {d23[0]}, [r8], ip
664
vst1.32 {d23[1]}, [r8], ip
674
/* single-pointer strided path: r4 is the read pointer, r5 the write pointer */
vld1.64 {d0-d1}, [r4,:128]!
675
vcvt.s32.f32 q0, q0, #16
676
vld1.64 {d2-d3}, [r4,:128]!
677
vcvt.s32.f32 q1, q1, #16
680
vld1.64 {d4-d5}, [r4,:128]!
681
vcvt.s32.f32 q2, q2, #16
682
vld1.64 {d6-d7}, [r4,:128]!
683
vcvt.s32.f32 q3, q3, #16
684
/* 16-bit lanes 1 and 3 are the upper halves of the two 32-bit values in each d register */
vst1.16 {d0[1]}, [r5,:16], ip
685
vst1.16 {d0[3]}, [r5,:16], ip
686
vst1.16 {d1[1]}, [r5,:16], ip
687
vst1.16 {d1[3]}, [r5,:16], ip
688
vst1.16 {d2[1]}, [r5,:16], ip
689
vst1.16 {d2[3]}, [r5,:16], ip
690
vst1.16 {d3[1]}, [r5,:16], ip
691
vst1.16 {d3[3]}, [r5,:16], ip
693
vld1.64 {d0-d1}, [r4,:128]!
694
vcvt.s32.f32 q0, q0, #16
695
vld1.64 {d2-d3}, [r4,:128]!
696
vcvt.s32.f32 q1, q1, #16
697
7: vst1.16 {d4[1]}, [r5,:16], ip
698
vst1.16 {d4[3]}, [r5,:16], ip
699
vst1.16 {d5[1]}, [r5,:16], ip
700
vst1.16 {d5[3]}, [r5,:16], ip
701
vst1.16 {d6[1]}, [r5,:16], ip
702
vst1.16 {d6[3]}, [r5,:16], ip
703
vst1.16 {d7[1]}, [r5,:16], ip
704
vst1.16 {d7[3]}, [r5,:16], ip
708
vst1.16 {d0[1]}, [r5,:16], ip
709
vst1.16 {d0[3]}, [r5,:16], ip
710
vst1.16 {d1[1]}, [r5,:16], ip
711
vst1.16 {d1[3]}, [r5,:16], ip
712
vst1.16 {d2[1]}, [r5,:16], ip
713
vst1.16 {d2[3]}, [r5,:16], ip
714
vst1.16 {d3[1]}, [r5,:16], ip
715
vst1.16 {d3[3]}, [r5,:16], ip
717
vld1.64 {d0-d1}, [r4,:128]!
718
vcvt.s32.f32 q0, q0, #16
719
vld1.64 {d2-d3}, [r4,:128]!
720
vcvt.s32.f32 q1, q1, #16
724
/*
 * ff_vector_fmul_neon
 * Visible steps: operands are read from r0 (into q0/q1) and from r1
 * (into q2/q3), both with a 128-bit alignment hint and write-back;
 * results held in q8-q11 (d16-d23) are written through r3 (128-bit
 * alignment hint, write-back). Labels 2 and 3 start the sequences that
 * store results already in q8/q9.
 * NOTE(review): the multiplies, the setup of r3, the length handling
 * and the return are not visible here; the bare numeric lines are not
 * instructions.
 */
function ff_vector_fmul_neon, export=1
727
vld1.64 {d0-d3}, [r0,:128]!
728
vld1.64 {d4-d7}, [r1,:128]!
735
vld1.64 {d0-d1}, [r0,:128]!
736
vld1.64 {d4-d5}, [r1,:128]!
738
vld1.64 {d2-d3}, [r0,:128]!
739
vld1.64 {d6-d7}, [r1,:128]!
741
vst1.64 {d16-d19},[r3,:128]!
742
vld1.64 {d0-d1}, [r0,:128]!
743
vld1.64 {d4-d5}, [r1,:128]!
745
vld1.64 {d2-d3}, [r0,:128]!
746
vld1.64 {d6-d7}, [r1,:128]!
748
vst1.64 {d20-d23},[r3,:128]!
752
2: vld1.64 {d0-d1}, [r0,:128]!
753
vld1.64 {d4-d5}, [r1,:128]!
754
vst1.64 {d16-d17},[r3,:128]!
756
vld1.64 {d2-d3}, [r0,:128]!
757
vld1.64 {d6-d7}, [r1,:128]!
758
vst1.64 {d18-d19},[r3,:128]!
760
3: vst1.64 {d16-d19},[r3,:128]!
764
/*
 * ff_vector_fmul_window_neon
 * VFP / NOVFP are line prefixes defined outside this view; presumably
 * they keep a line only for the hard-float / soft-float calling
 * convention respectively - TODO confirm. Here they select where a
 * scalar comes from (d0[0] versus the word at [sp]) and which stack
 * slot is read into lr.
 * Visible steps: q8 is filled with copies of one 32-bit scalar; r2 is
 * advanced by r5*4, r4 is set to r3 + r5*8 and ip to r0 + r5*8; 16-byte
 * blocks are read through r1 and r3 with write-back and through r2 and
 * r4 with post-increment r5; results in q10 and q11 are written through
 * r0 (write-back) and ip (post-increment r5).
 * NOTE(review): the origin of r5, most of the arithmetic, the loop
 * control and the return are not visible here; the bare numeric lines
 * are not instructions.
 */
function ff_vector_fmul_window_neon, export=1
765
VFP vdup.32 q8, d0[0]
766
/* load one 32-bit value from the stack and replicate it to all four lanes of q8 */
NOVFP vld1.32 {d16[],d17[]}, [sp,:32]
768
VFP ldr lr, [sp, #12]
769
NOVFP ldr lr, [sp, #16]
772
add r2, r2, r5, lsl #2
773
add r4, r3, r5, lsl #3
774
add ip, r0, r5, lsl #3
776
vld1.64 {d0,d1}, [r1,:128]!
777
vld1.64 {d2,d3}, [r2,:128], r5
778
vld1.64 {d4,d5}, [r3,:128]!
779
vld1.64 {d6,d7}, [r4,:128], r5
791
vld1.64 {d0,d1}, [r1,:128]!
793
vld1.64 {d18,d19},[r2,:128], r5
795
vld1.64 {d24,d25},[r3,:128]!
797
vld1.64 {d6,d7}, [r4,:128], r5
802
vst1.64 {d20,d21},[r0,:128]!
803
vst1.64 {d22,d23},[ip,:128], r5
805
/* d22 += d3 * d7 */
2: vmla.f32 d22, d3, d7
811
vst1.64 {d20,d21},[r0,:128]!
812
vst1.64 {d22,d23},[ip,:128], r5
816
#if CONFIG_VORBIS_DECODER
817
/*
 * ff_vorbis_inverse_coupling_neon (inside #if CONFIG_VORBIS_DECODER)
 * Visible steps: four floats at a time are read from r1 and from r0
 * (128-bit alignment hint, write-back); q12 = q11 + q2 and
 * q11 = q11 - q3 are computed; results are written through r3 and r12
 * (128-bit alignment hint, write-back). From label 3 on, one block is
 * read from r1 and r0 without write-back, and d2-d3 / d0-d1 are written
 * back through r0 / r1 with write-back.
 * NOTE(review): the instructions that derive q2/q3, the setup of r3 and
 * r12, the loop control, the return and the matching #endif are not
 * visible here; the bare numeric lines are not instructions.
 */
function ff_vorbis_inverse_coupling_neon, export=1
824
vld1.32 {d24-d25},[r1,:128]!
825
vld1.32 {d22-d23},[r0,:128]!
831
vadd.f32 q12, q11, q2
832
vsub.f32 q11, q11, q3
833
1: vld1.32 {d2-d3}, [r1,:128]!
834
vld1.32 {d0-d1}, [r0,:128]!
838
vst1.32 {d24-d25},[r3, :128]!
839
vst1.32 {d22-d23},[r12,:128]!
846
vld1.32 {d24-d25},[r1,:128]!
847
vld1.32 {d22-d23},[r0,:128]!
851
vst1.32 {d2-d3}, [r3, :128]!
852
vst1.32 {d0-d1}, [r12,:128]!
855
vadd.f32 q12, q11, q2
856
vsub.f32 q11, q11, q3
859
2: vst1.32 {d2-d3}, [r3, :128]!
860
vst1.32 {d0-d1}, [r12,:128]!
863
3: vld1.32 {d2-d3}, [r1,:128]
864
vld1.32 {d0-d1}, [r0,:128]
872
vst1.32 {d2-d3}, [r0,:128]!
873
vst1.32 {d0-d1}, [r1,:128]!
878
/*
 * ff_vector_fmul_scalar_neon
 * Visible steps: on VFP-prefixed builds q8 is filled with copies of
 * d0[0]; four floats at a time are read from r1 into q0-q3 (128-bit
 * alignment hint, write-back); at label 1, q0 is multiplied by q8;
 * q0-q3 are written to r0 (128-bit alignment hint, write-back). Label 3
 * starts a sequence that handles a single block of four floats.
 * VFP is a line prefix defined outside this view.
 * NOTE(review): the NOVFP counterpart, the other multiplies, the length
 * handling, the branches and the return are not visible here; the bare
 * numeric lines are not instructions.
 */
function ff_vector_fmul_scalar_neon, export=1
881
VFP vdup.32 q8, d0[0]
885
vld1.32 {q0},[r1,:128]!
886
vld1.32 {q1},[r1,:128]!
887
1: vmul.f32 q0, q0, q8
888
vld1.32 {q2},[r1,:128]!
890
vld1.32 {q3},[r1,:128]!
892
vst1.32 {q0},[r0,:128]!
894
vst1.32 {q1},[r0,:128]!
897
vld1.32 {q0},[r1,:128]!
898
vst1.32 {q2},[r0,:128]!
899
vld1.32 {q1},[r1,:128]!
900
vst1.32 {q3},[r0,:128]!
902
2: vst1.32 {q2},[r0,:128]!
903
vst1.32 {q3},[r0,:128]!
906
3: vld1.32 {q0},[r1,:128]!
908
vst1.32 {q0},[r0,:128]!
915
/*
 * ff_vector_fmul_sv_scalar_2_neon
 * Visible steps: d16 is filled with copies of a 32-bit scalar (d0[0] on
 * VFP-prefixed lines, r3 on NOVFP-prefixed lines); two floats at a time
 * are read from r1 (64-bit alignment hint, write-back) and from the
 * address in r12 (no write-back); results in d4 and d5 are written to
 * r0 (64-bit alignment hint, write-back).
 * VFP / NOVFP are line prefixes defined outside this view.
 * NOTE(review): the instructions that load r12, the multiplies, the
 * loop control and the return are not visible here; the bare numeric
 * lines are not instructions.
 */
function ff_vector_fmul_sv_scalar_2_neon, export=1
916
VFP vdup.32 d16, d0[0]
917
NOVFP vdup.32 d16, r3
919
vld1.32 {d0},[r1,:64]!
920
vld1.32 {d1},[r1,:64]!
925
vld1.32 {d2},[r12,:64]
927
vld1.32 {d3},[r12,:64]
931
vld1.32 {d0},[r1,:64]!
932
vld1.32 {d1},[r1,:64]!
933
vst1.32 {d4},[r0,:64]!
934
vst1.32 {d5},[r0,:64]!
936
2: vst1.32 {d4},[r0,:64]!
937
vst1.32 {d5},[r0,:64]!
941
/*
 * ff_vector_fmul_sv_scalar_4_neon
 * Visible steps: q10 is filled with copies of a 32-bit scalar (d0[0] on
 * VFP-prefixed lines, r3 on NOVFP-prefixed lines); four floats at a
 * time are read from r1 (128-bit alignment hint, write-back) and from
 * the address in r12 (no write-back); results in q8 and q9 are written
 * to r0 (128-bit alignment hint, write-back). Label 3 starts a sequence
 * that handles a single block of four floats.
 * VFP / NOVFP are line prefixes defined outside this view.
 * NOTE(review): the instructions that load r12, the multiplies, the
 * loop control and the return are not visible here; the bare numeric
 * lines are not instructions.
 */
function ff_vector_fmul_sv_scalar_4_neon, export=1
942
VFP vdup.32 q10, d0[0]
943
NOVFP vdup.32 q10, r3
948
vld1.32 {q0},[r1,:128]!
949
vld1.32 {q2},[r1,:128]!
951
vld1.32 {q1},[r12,:128]
953
vld1.32 {q3},[r12,:128]
960
vld1.32 {q0},[r1,:128]!
961
vld1.32 {q2},[r1,:128]!
962
vst1.32 {q8},[r0,:128]!
963
vst1.32 {q9},[r0,:128]!
965
2: vst1.32 {q8},[r0,:128]!
966
vst1.32 {q9},[r0,:128]!
969
3: vld1.32 {q0},[r1,:128]!
971
vld1.32 {q1},[r12,:128]
974
vst1.32 {q0},[r0,:128]!
980
/*
 * ff_sv_fmul_scalar_2_neon
 * Visible steps: on VFP-prefixed builds q8 is filled with copies of
 * d0[0]; two floats at a time are read from the address in r12 into d0
 * and d1 (64-bit alignment hint, no write-back); at label 1,
 * q1 = q0 * q8; q1 is written to r0 (128-bit alignment hint,
 * write-back).
 * VFP is a line prefix defined outside this view.
 * NOTE(review): the NOVFP counterpart, the instructions that load r12
 * between the two reads, the loop control and the return are not
 * visible here; the bare numeric lines are not instructions.
 */
function ff_sv_fmul_scalar_2_neon, export=1
983
VFP vdup.32 q8, d0[0]
986
vld1.32 {d0},[r12,:64]
988
vld1.32 {d1},[r12,:64]
989
1: vmul.f32 q1, q0, q8
993
vld1.32 {d0},[r12,:64]
995
vld1.32 {d1},[r12,:64]
996
vst1.32 {q1},[r0,:128]!
998
2: vst1.32 {q1},[r0,:128]!
1003
/*
 * ff_sv_fmul_scalar_4_neon
 * Visible steps: q8 is filled with copies of a 32-bit scalar (d0[0] on
 * VFP-prefixed lines, r2 on NOVFP-prefixed lines); at label 1 the next
 * pointer is fetched from the array at r1 into r12 (r1 advanced by 4),
 * four floats are read from that pointer (128-bit alignment hint) into
 * q0, and q0 is written to r0 (128-bit alignment hint, write-back).
 * VFP / NOVFP are line prefixes defined outside this view.
 * NOTE(review): the multiply by q8, the loop control and the return are
 * not visible here; the bare numeric lines are not instructions.
 */
function ff_sv_fmul_scalar_4_neon, export=1
1006
VFP vdup.32 q8, d0[0]
1007
NOVFP vdup.32 q8, r2
1008
1: ldr r12, [r1], #4
1009
vld1.32 {q0},[r12,:128]
1011
vst1.32 {q0},[r0,:128]!
1018
/*
 * ff_butterflies_float_neon
 * Visible steps at label 1: four floats are read from r0 into q0 and
 * four from r1 into q1 (128-bit alignment hint, no write-back); q2 is
 * then written to r1 and q1 to r0 (128-bit alignment hint, write-back),
 * so both buffers are updated in place.
 * NOTE(review): the arithmetic that produces q2 and the new q1, the
 * loop control and the return are not visible here; the bare numeric
 * lines are not instructions.
 */
function ff_butterflies_float_neon, export=1
1019
1: vld1.32 {q0},[r0,:128]
1020
vld1.32 {q1},[r1,:128]
1023
vst1.32 {q2},[r1,:128]!
1024
vst1.32 {q1},[r0,:128]!
1030
/*
 * ff_scalarproduct_float_neon
 * Visible steps: at label 1 four floats are read from r0 into q0 and
 * four from r1 into q1 (128-bit alignment hint, write-back); later
 * vpadd.f32 adds the two lanes of d0 pairwise, leaving their sum in
 * d0[0]; on NOVFP-prefixed builds that lane is copied to r0.
 * NOVFP is a line prefix defined outside this view.
 * NOTE(review): the multiply-accumulate, the loop control, the earlier
 * reduction steps and the return are not visible here; the bare numeric
 * lines are not instructions.
 */
function ff_scalarproduct_float_neon, export=1
1032
1: vld1.32 {q0},[r0,:128]!
1033
vld1.32 {q1},[r1,:128]!
1038
vpadd.f32 d0, d0, d0
1039
NOVFP vmov.32 r0, d0[0]
1043
/*
 * ff_int32_to_float_fmul_scalar_neon
 * Visible steps: q0 is filled with copies of a 32-bit scalar (d0[0] on
 * VFP-prefixed lines, r2 on NOVFP-prefixed lines); four 32-bit elements
 * at a time are read from r1 into q1 and q2 (128-bit alignment hint,
 * write-back); at label 1 the counter "len" is decremented by 8 with
 * flags set; q10 = q8 * q0; q9 and q10 are written to r0 (128-bit
 * alignment hint, write-back).
 * VFP / NOVFP and the register alias "len" are defined outside this
 * view.
 * NOTE(review): the integer-to-float conversion, the multiply producing
 * q9, the branches and the return are not visible here; the bare
 * numeric lines are not instructions.
 */
function ff_int32_to_float_fmul_scalar_neon, export=1
1044
VFP vdup.32 q0, d0[0]
1046
NOVFP vdup.32 q0, r2
1049
vld1.32 {q1},[r1,:128]!
1051
vld1.32 {q2},[r1,:128]!
1053
1: subs len, len, #8
1056
vmul.f32 q10, q8, q0
1058
vld1.32 {q1},[r1,:128]!
1060
vld1.32 {q2},[r1,:128]!
1062
vst1.32 {q9}, [r0,:128]!
1063
vst1.32 {q10},[r0,:128]!
1065
2: vst1.32 {q9}, [r0,:128]!
1066
vst1.32 {q10},[r0,:128]!
1071
/*
 * ff_vector_fmul_reverse_neon
 * Visible steps: r2 is advanced by r3*4; eight floats are read from r1
 * into q0-q1 (128-bit alignment hint, write-back) and eight from r2
 * into q2-q3 (128-bit alignment hint, post-increment r12); the four d
 * registers of the first operand are multiplied with the d registers of
 * the second operand taken in reverse order (d0*d7, d1*d6, d2*d5,
 * d3*d4); the eight products in q8-q9 are written to r0 (128-bit
 * alignment hint, write-back).
 * NOTE(review): the value placed in r12, any lane reordering inside the
 * d registers, the loop control and the return are not visible here;
 * the bare numeric lines are not instructions.
 */
function ff_vector_fmul_reverse_neon, export=1
1072
add r2, r2, r3, lsl #2
1075
vld1.32 {q0-q1}, [r1,:128]!
1076
vld1.32 {q2-q3}, [r2,:128], r12
1079
vmul.f32 d16, d0, d7
1080
vmul.f32 d17, d1, d6
1083
vmul.f32 d18, d2, d5
1084
vmul.f32 d19, d3, d4
1087
vld1.32 {q0-q1}, [r1,:128]!
1088
vld1.32 {q2-q3}, [r2,:128], r12
1089
vst1.32 {q8-q9}, [r0,:128]!
1091
2: vst1.32 {q8-q9}, [r0,:128]!
1095
/*
 * ff_vector_fmul_add_neon
 * Visible data flow, eight floats per step: values read from r1 (q0-q1)
 * are multiplied by values read from r2 (q8-q9) into q10-q11; values
 * read from r3 (q2-q3) are added to the products, giving q12-q13, which
 * are written to r0. All accesses carry a 128-bit alignment hint and
 * use write-back. The loads and multiplies for the next step are issued
 * before the store of the current one; label 2 stores the last result.
 * NOTE(review): the length handling, the branches and the return are
 * not visible here; the bare numeric lines are not instructions.
 */
function ff_vector_fmul_add_neon, export=1
1097
vld1.32 {q0-q1}, [r1,:128]!
1098
vld1.32 {q8-q9}, [r2,:128]!
1099
vld1.32 {q2-q3}, [r3,:128]!
1100
vmul.f32 q10, q0, q8
1101
vmul.f32 q11, q1, q9
1102
1: vadd.f32 q12, q2, q10
1103
vadd.f32 q13, q3, q11
1109
vld1.32 {q0}, [r1,:128]!
1110
vld1.32 {q8}, [r2,:128]!
1111
vmul.f32 q10, q0, q8
1112
vld1.32 {q1}, [r1,:128]!
1113
vld1.32 {q9}, [r2,:128]!
1114
vmul.f32 q11, q1, q9
1115
vld1.32 {q2-q3}, [r3,:128]!
1116
vst1.32 {q12-q13},[r0,:128]!
1118
2: vst1.32 {q12-q13},[r0,:128]!
1122
/*
 * ff_vector_clipf_neon
 * Visible data flow: q0 and q1 are filled with copies of two 32-bit
 * scalars (d0[0] and d0[1] on VFP-prefixed lines, r2 and r3 on
 * NOVFP-prefixed lines). Four floats at a time are read from r1
 * (128-bit alignment hint, write-back); each vector is first limited
 * from above with vmin against q1 and then from below with vmax against
 * q0; the results in q8 and q9 are written to r0 (128-bit alignment
 * hint, write-back). The vmin for the next step is issued before the
 * store of the current one; label 2 stores the last result.
 * VFP / NOVFP are line prefixes defined outside this view.
 * NOTE(review): the length handling, the branches and the return are
 * not visible here; the bare numeric lines are not instructions.
 */
function ff_vector_clipf_neon, export=1
1123
VFP vdup.32 q1, d0[1]
1124
VFP vdup.32 q0, d0[0]
1125
NOVFP vdup.32 q0, r2
1126
NOVFP vdup.32 q1, r3
1128
vld1.f32 {q2},[r1,:128]!
1129
vmin.f32 q10, q2, q1
1130
vld1.f32 {q3},[r1,:128]!
1131
vmin.f32 q11, q3, q1
1132
1: vmax.f32 q8, q10, q0
1133
vmax.f32 q9, q11, q0
1136
vld1.f32 {q2},[r1,:128]!
1137
vmin.f32 q10, q2, q1
1138
vld1.f32 {q3},[r1,:128]!
1139
vmin.f32 q11, q3, q1
1140
vst1.f32 {q8},[r0,:128]!
1141
vst1.f32 {q9},[r0,:128]!
1143
2: vst1.f32 {q8},[r0,:128]!
1144
vst1.f32 {q9},[r0,:128]!