1
From 2d316e271f8837e0888b664ae45f6174f827a982 Mon Sep 17 00:00:00 2001
2
From: =?utf-8?q?Lo=C3=AFc=20Minier?= <lool@dooz.org>
3
Date: Sat, 10 Oct 2009 12:58:17 +0200
4
Subject: [PATCH 16/27] ARM: NEON optimised FFT and MDCT
6
Vorbis and AC3 ~3x faster.
8
Parts by Naotoshi Nojiri, naonoj gmail
10
git-svn-id: file:///var/local/repositories/ffmpeg/trunk@19806 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
16
libavcodec/Makefile | 5 +
17
libavcodec/arm/fft_neon.S | 369 ++++++++++++++++++++++++++++++++++++++++++++
18
libavcodec/arm/mdct_neon.S | 178 +++++++++++++++++++++
19
libavcodec/dsputil.h | 4 +
20
libavcodec/fft.c | 10 +-
21
5 files changed, 565 insertions(+), 1 deletions(-)
22
create mode 100644 libavcodec/arm/fft_neon.S
23
create mode 100644 libavcodec/arm/mdct_neon.S
25
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
26
index a213309..02e0e8a 100644
27
--- a/libavcodec/Makefile
28
+++ b/libavcodec/Makefile
29
@@ -480,6 +480,11 @@ OBJS-$(HAVE_IWMMXT) += arm/dsputil_iwmmxt.o \
30
arm/mpegvideo_iwmmxt.o \
32
NEON-OBJS-$(CONFIG_THEORA_DECODER) += arm/vp3dsp_neon.o
34
+NEON-OBJS-$(CONFIG_FFT) += arm/fft_neon.o \
36
+NEON-OBJS-$(CONFIG_MDCT) += arm/mdct_neon.o \
38
NEON-OBJS-$(CONFIG_VP3_DECODER) += arm/vp3dsp_neon.o
40
OBJS-$(HAVE_NEON) += arm/dsputil_neon.o \
41
diff --git a/libavcodec/arm/fft_neon.S b/libavcodec/arm/fft_neon.S
43
index 0000000..6ed5789
45
+++ b/libavcodec/arm/fft_neon.S
48
+ * ARM NEON optimised FFT
50
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
51
+ * Copyright (c) 2009 Naotoshi Nojiri
53
+ * This file is part of FFmpeg.
55
+ * FFmpeg is free software; you can redistribute it and/or
56
+ * modify it under the terms of the GNU Lesser General Public
57
+ * License as published by the Free Software Foundation; either
58
+ * version 2.1 of the License, or (at your option) any later version.
60
+ * FFmpeg is distributed in the hope that it will be useful,
61
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
62
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
63
+ * Lesser General Public License for more details.
65
+ * You should have received a copy of the GNU Lesser General Public
66
+ * License along with FFmpeg; if not, write to the Free Software
67
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
72
+#define M_SQRT1_2 0.70710678118654752440
77
+ vld1.32 {d0-d3}, [r0,:128]
79
+ vext.32 q8, q1, q1, #1 @ d16=i2,r3 d17=i3,r2
80
+ vsub.f32 d6, d0, d1 @ r0-r1,i0-i1
81
+ vsub.f32 d7, d16, d17 @ r3-r2,i2-i3
82
+ vadd.f32 d4, d0, d1 @ r0+r1,i0+i1
83
+ vadd.f32 d5, d2, d3 @ i2+i3,r2+r3
89
+ vst1.32 {d0-d3}, [r0,:128]
96
+ vld1.32 {d0-d3}, [r1,:128]!
97
+ vld1.32 {d16-d19}, [r1,:128]
99
+ movw r2, #0x04f3 @ sqrt(1/2)
104
+ vext.32 q11, q1, q1, #1 @ i2,r3,i3,r2
105
+ vadd.f32 d4, d16, d17 @ r4+r5,i4+i5
107
+ vadd.f32 d5, d18, d19 @ r6+r7,i6+i7
108
+ vsub.f32 d17, d16, d17 @ r4-r5,i4-i5
109
+ vsub.f32 d19, d18, d19 @ r6-r7,i6-i7
111
+ vadd.f32 d20, d0, d1 @ r0+r1,i0+i1
112
+ vadd.f32 d21, d2, d3 @ r2+r3,i2+i3
113
+ vmul.f32 d26, d17, d28 @ -a2r*w,a2i*w
114
+ vext.32 q3, q2, q2, #1
115
+ vmul.f32 d27, d19, d29 @ a3r*w,-a3i*w
116
+ vsub.f32 d23, d22, d23 @ i2-i3,r3-r2
117
+ vsub.f32 d22, d0, d1 @ r0-r1,i0-i1
118
+ vmul.f32 d24, d17, d31 @ a2r*w,a2i*w
119
+ vmul.f32 d25, d19, d31 @ a3r*w,a3i*w
120
+ vadd.f32 d0, d20, d21
121
+ vsub.f32 d2, d20, d21
122
+ vadd.f32 d1, d22, d23
124
+ vsub.f32 d3, d22, d23
125
+ vsub.f32 d6, d6, d7
126
+ vadd.f32 d24, d24, d26 @ a2r+a2i,a2i-a2r t1,t2
127
+ vadd.f32 d25, d25, d27 @ a3r-a3i,a3i+a3r t5,t6
128
+ vadd.f32 d7, d4, d5
129
+ vsub.f32 d18, d2, d6
130
+ vext.32 q13, q12, q12, #1
131
+ vadd.f32 d2, d2, d6
132
+ vsub.f32 d16, d0, d7
133
+ vadd.f32 d5, d25, d24
134
+ vsub.f32 d4, d26, d27
135
+ vadd.f32 d0, d0, d7
136
+ vsub.f32 d17, d1, d5
137
+ vsub.f32 d19, d3, d4
138
+ vadd.f32 d3, d3, d4
139
+ vadd.f32 d1, d1, d5
141
+ vst1.32 {d16-d19}, [r1,:128]
142
+ vst1.32 {d0-d3}, [r0,:128]
149
+ vld1.32 {d16-d19}, [r0,:128]! @ q8{r0,i0,r1,i1} q9{r2,i2,r3,i3}
151
+ vld1.32 {d2-d3}, [r1,:128]
152
+ vext.32 q13, q9, q9, #1
153
+ vld1.32 {d22-d25}, [r0,:128]! @ q11{r4,i4,r5,i5} q12{r6,i6,r7,i7}
154
+ vadd.f32 d4, d16, d17
155
+ vsub.f32 d5, d16, d17
156
+ vadd.f32 d18, d18, d19
157
+ vsub.f32 d19, d26, d27
159
+ vadd.f32 d20, d22, d23
160
+ vsub.f32 d22, d22, d23
161
+ vsub.f32 d23, d24, d25
162
+ vadd.f32 q8, q2, q9 @ {r0,i0,r1,i1}
163
+ vadd.f32 d21, d24, d25
164
+ vmul.f32 d24, d22, d2
165
+ vsub.f32 q9, q2, q9 @ {r2,i2,r3,i3}
166
+ vmul.f32 d25, d23, d3
167
+ vuzp.32 d16, d17 @ {r0,r1,i0,i1}
168
+ vmul.f32 q1, q11, d2[1]
169
+ vuzp.32 d18, d19 @ {r2,r3,i2,i3}
171
+ vadd.f32 q11, q12, q1 @ {t1a,t2a,t5,t6}
172
+ vld1.32 {d24-d27}, [r0,:128]! @ q12{r8,i8,r9,i9} q13{r10,i10,r11,i11}
174
+ vld1.32 {d28-d31}, [r0,:128] @ q14{r12,i12,r13,i13} q15{r14,i14,r15,i15}
175
+ vadd.f32 d0, d22, d20
176
+ vadd.f32 d1, d21, d23
177
+ vsub.f32 d2, d21, d23
178
+ vsub.f32 d3, d22, d20
180
+ vext.32 q13, q13, q13, #1
181
+ vsub.f32 q10, q8, q0 @ {r4,r5,i4,i5}
182
+ vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1}
183
+ vext.32 q15, q15, q15, #1
184
+ vsub.f32 q11, q9, q1 @ {r6,r7,i6,i7}
185
+ vswp d25, d26 @ q12{r8,i8,i10,r11} q13{r9,i9,i11,r10}
186
+ vadd.f32 q9, q9, q1 @ {r2,r3,i2,i3}
187
+ vswp d29, d30 @ q14{r12,i12,i14,r15} q15{r13,i13,i15,r14}
188
+ vadd.f32 q0, q12, q13 @ {t1,t2,t5,t6}
189
+ vadd.f32 q1, q14, q15 @ {t1a,t2a,t5a,t6a}
190
+ movrel r2, ff_cos_16
191
+ vsub.f32 q13, q12, q13 @ {t3,t4,t7,t8}
193
+ vsub.f32 q15, q14, q15 @ {t3a,t4a,t7a,t8a}
196
+ vswp d1, d26 @ q0{t1,t2,t3,t4} q13{t6,t5,t7,t8}
197
+ vswp d3, d30 @ q1{t1a,t2a,t3a,t4a} q15{t6a,t5a,t7a,t8a}
198
+ vadd.f32 q12, q0, q13 @ {r8,i8,r9,i9}
199
+ vadd.f32 q14, q1, q15 @ {r12,i12,r13,i13}
200
+ vld1.32 {d4-d5}, [r2,:64]
201
+ vsub.f32 q13, q0, q13 @ {r10,i10,r11,i11}
202
+ vsub.f32 q15, q1, q15 @ {r14,i14,r15,i15}
203
+ vswp d25, d28 @ q12{r8,i8,r12,i12} q14{r9,i9,r13,i13}
204
+ vld1.32 {d6-d7}, [r3,:128]
206
+ vmul.f32 q14, q14, d4[1]
207
+ vmul.f32 q1, q1, q3
208
+ vmla.f32 q14, q1, d5[1] @ {t1a,t2a,t5a,t6a}
209
+ vswp d27, d30 @ q13{r10,i10,r14,i14} q15{r11,i11,r15,i15}
211
+ vadd.f32 d0, d28, d24
212
+ vadd.f32 d1, d25, d29
213
+ vsub.f32 d2, d25, d29
214
+ vsub.f32 d3, d28, d24
215
+ vsub.f32 q12, q8, q0 @ {r8,r9,i8,i9}
216
+ vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1}
217
+ vsub.f32 q14, q10, q1 @ {r12,r13,i12,i13}
219
+ vadd.f32 q10, q10, q1 @ {r4,r5,i4,i5}
221
+ vmul.f32 q13, q13, d5[0]
223
+ vmul.f32 q15, q15, d5[1]
224
+ vst2.32 {d16-d17},[r0,:128], r1
225
+ vmul.f32 q0, q0, q3
226
+ vst2.32 {d20-d21},[r0,:128], r1
227
+ vmul.f32 q1, q1, q3
228
+ vmla.f32 q13, q0, d5[0] @ {t1,t2,t5,t6}
229
+ vmla.f32 q15, q1, d4[1] @ {t1a,t2a,t5a,t6a}
230
+ vst2.32 {d24-d25},[r0,:128], r1
231
+ vst2.32 {d28-d29},[r0,:128]
234
+ vadd.f32 d0, d30, d26
235
+ vadd.f32 d1, d27, d31
236
+ vsub.f32 d2, d27, d31
237
+ vsub.f32 d3, d30, d26
238
+ vsub.f32 q13, q9, q0 @ {r10,r11,i10,i11}
239
+ vadd.f32 q9, q9, q0 @ {r2,r3,i2,i3}
240
+ vsub.f32 q15, q11, q1 @ {r14,r15,i14,i15}
241
+ vadd.f32 q11, q11, q1 @ {r6,r7,i6,i7}
242
+ vst2.32 {d18-d19},[r0,:128], r1
243
+ vst2.32 {d22-d23},[r0,:128], r1
244
+ vst2.32 {d26-d27},[r0,:128], r1
245
+ vst2.32 {d30-d31},[r0,:128]
249
+function fft_pass_neon
252
+ lsl r5, r2, #3 @ 2 * n * sizeof FFTSample
253
+ lsl r4, r2, #4 @ 2 * n * sizeof FFTComplex
254
+ lsl r2, r2, #5 @ 4 * n * sizeof FFTComplex
256
+ add r4, r4, r0 @ &z[o1]
257
+ add r2, r2, r0 @ &z[o2]
258
+ add r3, r3, r0 @ &z[o3]
259
+ vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]}
261
+ vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]}
262
+ add r5, r5, r1 @ wim
263
+ vld1.32 {d6-d7}, [r12,:128] @ pmmp
265
+ vld1.32 {d4}, [r1,:64]! @ {wre[0],wre[1]}
266
+ sub r5, r5, #4 @ wim--
268
+ vmul.f32 q11, q11, d4[1]
269
+ vmul.f32 q1, q1, q3
270
+ vld1.32 {d5[0]}, [r5,:32] @ d5[0] = wim[-1]
271
+ vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a}
272
+ vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]}
273
+ sub r6, r6, #1 @ n--
274
+ vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]}
276
+ vadd.f32 d0, d22, d20
277
+ vadd.f32 d1, d21, d23
278
+ vsub.f32 d2, d21, d23
279
+ vsub.f32 d3, d22, d20
280
+ vsub.f32 q10, q8, q0
281
+ vadd.f32 q8, q8, q0
282
+ vsub.f32 q11, q9, q1
283
+ vadd.f32 q9, q9, q1
284
+ vst2.32 {d20-d21},[r2,:128]! @ {z[o2],z[o2+1]}
285
+ vst2.32 {d16-d17},[r0,:128]! @ {z[0],z[1]}
286
+ vst2.32 {d22-d23},[r3,:128]! @ {z[o3],z[o3+1]}
287
+ vst2.32 {d18-d19},[r4,:128]! @ {z[o1],z[o1+1]}
288
+ sub r5, r5, #8 @ wim -= 2
290
+ vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]}
291
+ vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]}
293
+ vld1.32 {d4}, [r1]! @ {wre[0],wre[1]}
295
+ vmul.f32 q10, q10, d4[0]
297
+ vmul.f32 q11, q11, d4[1]
298
+ vld1.32 {d5}, [r5] @ {wim[-1],wim[0]}
299
+ vmul.f32 q0, q0, q3
300
+ sub r5, r5, #8 @ wim -= 2
301
+ vmul.f32 q1, q1, q3
302
+ vmla.f32 q10, q0, d5[1] @ {t1,t2,t5,t6}
303
+ vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a}
304
+ vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]}
305
+ subs r6, r6, #1 @ n--
306
+ vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]}
308
+ vadd.f32 d0, d22, d20
309
+ vadd.f32 d1, d21, d23
310
+ vsub.f32 d2, d21, d23
311
+ vsub.f32 d3, d22, d20
312
+ vsub.f32 q10, q8, q0
313
+ vadd.f32 q8, q8, q0
314
+ vsub.f32 q11, q9, q1
315
+ vadd.f32 q9, q9, q1
316
+ vst2.32 {d20-d21}, [r2,:128]! @ {z[o2],z[o2+1]}
317
+ vst2.32 {d16-d17}, [r0,:128]! @ {z[0],z[1]}
318
+ vst2.32 {d22-d23}, [r3,:128]! @ {z[o3],z[o3+1]}
319
+ vst2.32 {d18-d19}, [r4,:128]! @ {z[o1],z[o1+1]}
325
+.macro def_fft n, n2, n4
327
+function fft\n\()_neon
331
+ add r0, r4, #\n4*2*8
333
+ add r0, r4, #\n4*3*8
337
+ movrel r1, ff_cos_\n
345
+ def_fft 128, 64, 32
346
+ def_fft 256, 128, 64
347
+ def_fft 512, 256, 128
348
+ def_fft 1024, 512, 256
349
+ def_fft 2048, 1024, 512
350
+ def_fft 4096, 2048, 1024
351
+ def_fft 8192, 4096, 2048
352
+ def_fft 16384, 8192, 4096
353
+ def_fft 32768, 16384, 8192
354
+ def_fft 65536, 32768, 16384
356
+function ff_fft_calc_neon, export=1
359
+ movrel r3, fft_tab_neon
360
+ ldr r3, [r3, r2, lsl #2]
365
+function ff_fft_permute_neon, export=1
368
+ ldr r2, [r0] @ nbits
369
+ ldr r3, [r0, #20] @ tmp_buf
370
+ ldr r0, [r0, #8] @ revtab
374
+ vld1.32 {d0-d1}, [r1,:128]!
377
+ uxtah r4, r3, r4, ror #16
378
+ vst1.32 {d0}, [lr,:64]
379
+ vst1.32 {d1}, [r4,:64]
383
+ sub r1, r1, r2, lsl #3
385
+ vld1.32 {d0-d3}, [r3,:128]!
386
+ vst1.32 {d0-d3}, [r1,:128]!
408
+ .word fft16384_neon
409
+ .word fft32768_neon
410
+ .word fft65536_neon
411
+ .size fft_tab_neon, . - fft_tab_neon
414
+pmmp: .float +1.0, -1.0, -1.0, +1.0
415
+mppm: .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
416
diff --git a/libavcodec/arm/mdct_neon.S b/libavcodec/arm/mdct_neon.S
418
index 0000000..6d1dcfd
420
+++ b/libavcodec/arm/mdct_neon.S
423
+ * ARM NEON optimised MDCT
424
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
426
+ * This file is part of FFmpeg.
428
+ * FFmpeg is free software; you can redistribute it and/or
429
+ * modify it under the terms of the GNU Lesser General Public
430
+ * License as published by the Free Software Foundation; either
431
+ * version 2.1 of the License, or (at your option) any later version.
433
+ * FFmpeg is distributed in the hope that it will be useful,
434
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
435
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
436
+ * Lesser General Public License for more details.
438
+ * You should have received a copy of the GNU Lesser General Public
439
+ * License along with FFmpeg; if not, write to the Free Software
440
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
448
+function ff_imdct_half_neon, export=1
452
+ ldr lr, [r0, #4] @ nbits
453
+ ldr r4, [r0, #8] @ tcos
454
+ ldr r5, [r0, #12] @ tsin
455
+ ldr r3, [r0, #24] @ revtab
456
+ lsl r12, r12, lr @ n = 1 << nbits
457
+ lsr lr, r12, #2 @ n4 = n >> 2
458
+ add r7, r2, r12, lsl #1
462
+ vld1.32 {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0
463
+ vld1.32 {d0-d1}, [r2,:128]! @ d0 =m0,x d1 =m1,x
464
+ vld1.32 {d2}, [r4,:64]! @ d2=c0,c1
465
+ vld1.32 {d3}, [r5,:64]! @ d3=s0,s1
468
+ vmul.f32 d6, d16, d2
469
+ vmul.f32 d7, d0, d2
473
+ vmul.f32 d4, d0, d3
474
+ vmul.f32 d5, d16, d3
475
+ vsub.f32 d4, d6, d4
476
+ vadd.f32 d5, d5, d7
477
+ uxtah r8, r1, r6, ror #16
480
+ vld1.32 {d16-d17},[r7,:128],r12
481
+ vld1.32 {d0-d1}, [r2,:128]!
483
+ vld1.32 {d2}, [r4,:64]!
485
+ vmul.f32 d6, d16, d2
486
+ vld1.32 {d3}, [r5,:64]!
487
+ vmul.f32 d7, d0, d2
488
+ vst2.32 {d4[0],d5[0]}, [r6,:64]
489
+ vst2.32 {d4[1],d5[1]}, [r8,:64]
492
+ vst2.32 {d4[0],d5[0]}, [r6,:64]
493
+ vst2.32 {d4[1],d5[1]}, [r8,:64]
498
+ bl ff_fft_calc_neon
501
+ ldr lr, [r4, #4] @ nbits
502
+ ldr r5, [r4, #12] @ tsin
503
+ ldr r4, [r4, #8] @ tcos
504
+ lsl r12, r12, lr @ n = 1 << nbits
505
+ lsr lr, r12, #3 @ n8 = n >> 3
507
+ add r4, r4, lr, lsl #2
508
+ add r5, r5, lr, lsl #2
509
+ add r6, r6, lr, lsl #3
519
+ vld1.32 {d0-d1}, [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0
520
+ vld1.32 {d20-d21},[r6,:128]! @ d20=i2,r2 d21=i3,r3
521
+ vld1.32 {d18}, [r2,:64], r12 @ d18=s1,s0
526
+ vmul.f32 d7, d0, d18
527
+ vld1.32 {d19}, [r5,:64]! @ d19=s2,s3
528
+ vmul.f32 d4, d1, d18
529
+ vld1.32 {d16}, [r1,:64], r12 @ d16=c1,c0
530
+ vmul.f32 d5, d21, d19
531
+ vld1.32 {d17}, [r4,:64]! @ d17=c2,c3
532
+ vmul.f32 d6, d20, d19
533
+ vmul.f32 d22, d1, d16
534
+ vmul.f32 d23, d21, d17
535
+ vmul.f32 d24, d0, d16
536
+ vmul.f32 d25, d20, d17
537
+ vadd.f32 d7, d7, d22
538
+ vadd.f32 d6, d6, d23
539
+ vsub.f32 d4, d4, d24
540
+ vsub.f32 d5, d5, d25
542
+ vld1.32 {d0-d1}, [r3,:128], r7
543
+ vld1.32 {d20-d21},[r6,:128]!
544
+ vld1.32 {d18}, [r2,:64], r12
551
+ vst1.32 {d4-d5}, [r0,:128], r7
552
+ vst1.32 {d6-d7}, [r8,:128]!
559
+ vst1.32 {d4-d5}, [r0,:128]
560
+ vst1.32 {d6-d7}, [r8,:128]
565
+function ff_imdct_calc_neon, export=1
574
+ bl ff_imdct_half_neon
576
+ add r0, r5, r4, lsl #2
577
+ add r1, r5, r4, lsl #1
582
+ vmov.i32 d30, #1<<31
584
+ vld1.32 {d0-d1}, [r2,:128], r3
587
+ vld1.32 {d2-d3}, [r1,:128]!
592
+ vst1.32 {d2}, [r0,:64], r6
593
+ vst1.32 {d3}, [r0,:64], r6
594
+ vst1.32 {d4-d5}, [r5,:128]!
600
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
601
index e9c2bfc..c4abd14 100644
602
--- a/libavcodec/dsputil.h
603
+++ b/libavcodec/dsputil.h
604
@@ -692,11 +692,13 @@ extern FFTSample* ff_cos_tabs[13];
605
int ff_fft_init(FFTContext *s, int nbits, int inverse);
606
void ff_fft_permute_c(FFTContext *s, FFTComplex *z);
607
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
608
+void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
609
void ff_fft_calc_c(FFTContext *s, FFTComplex *z);
610
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
611
void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z);
612
void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z);
613
void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z);
614
+void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
617
* Do the permutation needed BEFORE calling ff_fft_calc().
618
@@ -766,6 +768,8 @@ void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *inpu
619
void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input);
620
void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output, const FFTSample *input);
621
void ff_imdct_half_sse(MDCTContext *s, FFTSample *output, const FFTSample *input);
622
+void ff_imdct_calc_neon(MDCTContext *s, FFTSample *output, const FFTSample *input);
623
+void ff_imdct_half_neon(MDCTContext *s, FFTSample *output, const FFTSample *input);
624
void ff_mdct_calc(MDCTContext *s, FFTSample *out, const FFTSample *input);
625
void ff_mdct_end(MDCTContext *s);
627
diff --git a/libavcodec/fft.c b/libavcodec/fft.c
628
index 296f634..b05ddda 100644
629
--- a/libavcodec/fft.c
630
+++ b/libavcodec/fft.c
631
@@ -64,6 +64,7 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
632
float alpha, c1, s1, s2;
634
int av_unused has_vectors;
635
+ int revtab_shift = 0;
637
if (nbits < 2 || nbits > 16)
639
@@ -112,6 +113,12 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
640
s->fft_calc = ff_fft_calc_altivec;
644
+ s->fft_permute = ff_fft_permute_neon;
645
+ s->fft_calc = ff_fft_calc_neon;
646
+ s->imdct_calc = ff_imdct_calc_neon;
647
+ s->imdct_half = ff_imdct_half_neon;
652
@@ -125,7 +132,8 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
656
- s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = i;
657
+ s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] =
659
s->tmp_buf = av_malloc(n * sizeof(FFTComplex));
661
int np, nblocks, np2, l;