~medibuntu-maintainers/ffmpeg/medibuntu.karmic

« back to all changes in this revision

Viewing changes to debian/patches/neon/0016-ARM-NEON-optimised-FFT-and-MDCT.patch

  • Committer: Bazaar Package Importer
  • Author(s): Reinhard Tartler, Reinhard Tartler, Loïc Minier
  • Date: 2009-10-14 09:15:50 UTC
  • Revision ID: james.westby@ubuntu.com-20091014091550-7z06g45xfd40wbsc
Tags: 4:0.5+svn20090706-2ubuntu2
[ Reinhard Tartler ]
* merge from the 'ffmpeg' package
* Make arguments of av_set_pts_info() unsigned.
* update debian/changelog
* use patch for issue1245 from git.ffmpeg.org
* Support constant-quant encoding for libtheora, LP: #356322
* increase swscale compile time width (VOF/VOFW), LP: #443264
* Add missing Conflicts/Replaces to ensure a working upgrade path from
  intrepid/jaunty. LP: #418785

[ Loïc Minier ]
* Update config for karmic's armel toolchain.
* Enable neon flavour; LP: #383240.
* Update NEON confflags to assume v7 and VFP.
* Add backported NEON patches from ffmpeg trunk; see debian/patches/neon/.
* Pass proper --cpu and --extra-flags on armel.
* Pass -fPIC -DPIC to neon pass.

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
From 2d316e271f8837e0888b664ae45f6174f827a982 Mon Sep 17 00:00:00 2001
 
2
From: =?utf-8?q?Lo=C3=AFc=20Minier?= <lool@dooz.org>
 
3
Date: Sat, 10 Oct 2009 12:58:17 +0200
 
4
Subject: [PATCH 16/27] ARM: NEON optimised FFT and MDCT
 
5
 
 
6
Vorbis and AC3 ~3x faster.
 
7
 
 
8
Parts by Naotoshi Nojiri, naonoj gmail
 
9
 
 
10
git-svn-id: file:///var/local/repositories/ffmpeg/trunk@19806 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
 
11
 
 
12
Conflicts:
 
13
 
 
14
        libavcodec/Makefile
 
15
---
 
16
 libavcodec/Makefile        |    5 +
 
17
 libavcodec/arm/fft_neon.S  |  369 ++++++++++++++++++++++++++++++++++++++++++++
 
18
 libavcodec/arm/mdct_neon.S |  178 +++++++++++++++++++++
 
19
 libavcodec/dsputil.h       |    4 +
 
20
 libavcodec/fft.c           |   10 +-
 
21
 5 files changed, 565 insertions(+), 1 deletions(-)
 
22
 create mode 100644 libavcodec/arm/fft_neon.S
 
23
 create mode 100644 libavcodec/arm/mdct_neon.S
 
24
 
 
25
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
 
26
index a213309..02e0e8a 100644
 
27
--- a/libavcodec/Makefile
 
28
+++ b/libavcodec/Makefile
 
29
@@ -480,6 +480,11 @@ OBJS-$(HAVE_IWMMXT)                    += arm/dsputil_iwmmxt.o          \
 
30
                                           arm/mpegvideo_iwmmxt.o        \
 
31
 
 
32
 NEON-OBJS-$(CONFIG_THEORA_DECODER)     += arm/vp3dsp_neon.o
 
33
+
 
34
+NEON-OBJS-$(CONFIG_FFT)                += arm/fft_neon.o                \
 
35
+
 
36
+NEON-OBJS-$(CONFIG_MDCT)               += arm/mdct_neon.o               \
 
37
+
 
38
 NEON-OBJS-$(CONFIG_VP3_DECODER)        += arm/vp3dsp_neon.o
 
39
 
 
40
 OBJS-$(HAVE_NEON)                      += arm/dsputil_neon.o            \
 
41
diff --git a/libavcodec/arm/fft_neon.S b/libavcodec/arm/fft_neon.S
 
42
new file mode 100644
 
43
index 0000000..6ed5789
 
44
--- /dev/null
 
45
+++ b/libavcodec/arm/fft_neon.S
 
46
@@ -0,0 +1,369 @@
 
47
+/*
 
48
+ * ARM NEON optimised FFT
 
49
+ *
 
50
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 
51
+ * Copyright (c) 2009 Naotoshi Nojiri
 
52
+ *
 
53
+ * This file is part of FFmpeg.
 
54
+ *
 
55
+ * FFmpeg is free software; you can redistribute it and/or
 
56
+ * modify it under the terms of the GNU Lesser General Public
 
57
+ * License as published by the Free Software Foundation; either
 
58
+ * version 2.1 of the License, or (at your option) any later version.
 
59
+ *
 
60
+ * FFmpeg is distributed in the hope that it will be useful,
 
61
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
 
62
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 
63
+ * Lesser General Public License for more details.
 
64
+ *
 
65
+ * You should have received a copy of the GNU Lesser General Public
 
66
+ * License along with FFmpeg; if not, write to the Free Software
 
67
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 
68
+ */
 
69
+
 
70
+#include "asm.S"
 
71
+
 
72
+#define M_SQRT1_2 0.70710678118654752440
 
73
+
 
74
+        .text
 
75
+
 
76
+function fft4_neon
 
77
+        vld1.32         {d0-d3}, [r0,:128]
 
78
+
 
79
+        vext.32         q8,  q1,  q1,  #1       @ i2,r3 d3=i3,r2
 
80
+        vsub.f32        d6,  d0,  d1            @ r0-r1,i0-i1
 
81
+        vsub.f32        d7,  d16, d17           @ r3-r2,i2-i3
 
82
+        vadd.f32        d4,  d0,  d1            @ r0+r1,i0+i1
 
83
+        vadd.f32        d5,  d2,  d3            @ i2+i3,r2+r3
 
84
+        vadd.f32        d1,  d6,  d7
 
85
+        vsub.f32        d3,  d6,  d7
 
86
+        vadd.f32        d0,  d4,  d5
 
87
+        vsub.f32        d2,  d4,  d5
 
88
+
 
89
+        vst1.32         {d0-d3}, [r0,:128]
 
90
+
 
91
+        bx              lr
 
92
+.endfunc
 
93
+
 
94
+function fft8_neon
 
95
+        mov             r1,  r0
 
96
+        vld1.32         {d0-d3},   [r1,:128]!
 
97
+        vld1.32         {d16-d19}, [r1,:128]
 
98
+
 
99
+        movw            r2,  #0x04f3            @ sqrt(1/2)
 
100
+        movt            r2,  #0x3f35
 
101
+        eor             r3,  r2,  #1<<31
 
102
+        vdup.32         d31, r2
 
103
+
 
104
+        vext.32         q11, q1,  q1,  #1       @ i2,r3,i3,r2
 
105
+        vadd.f32        d4,  d16, d17           @ r4+r5,i4+i5
 
106
+        vmov            d28, r3,  r2
 
107
+        vadd.f32        d5,  d18, d19           @ r6+r7,i6+i7
 
108
+        vsub.f32        d17, d16, d17           @ r4-r5,i4-i5
 
109
+        vsub.f32        d19, d18, d19           @ r6-r7,i6-i7
 
110
+        vrev64.32       d29, d28
 
111
+        vadd.f32        d20, d0,  d1            @ r0+r1,i0+i1
 
112
+        vadd.f32        d21, d2,  d3            @ r2+r3,i2+i3
 
113
+        vmul.f32        d26, d17, d28           @ -a2r*w,a2i*w
 
114
+        vext.32         q3,  q2,  q2,  #1
 
115
+        vmul.f32        d27, d19, d29           @ a3r*w,-a3i*w
 
116
+        vsub.f32        d23, d22, d23           @ i2-i3,r3-r2
 
117
+        vsub.f32        d22, d0,  d1            @ r0-r1,i0-i1
 
118
+        vmul.f32        d24, d17, d31           @ a2r*w,a2i*w
 
119
+        vmul.f32        d25, d19, d31           @ a3r*w,a3i*w
 
120
+        vadd.f32        d0,  d20, d21
 
121
+        vsub.f32        d2,  d20, d21
 
122
+        vadd.f32        d1,  d22, d23
 
123
+        vrev64.32       q13, q13
 
124
+        vsub.f32        d3,  d22, d23
 
125
+        vsub.f32        d6,  d6,  d7
 
126
+        vadd.f32        d24, d24, d26           @ a2r+a2i,a2i-a2r   t1,t2
 
127
+        vadd.f32        d25, d25, d27           @ a3r-a3i,a3i+a3r   t5,t6
 
128
+        vadd.f32        d7,  d4,  d5
 
129
+        vsub.f32        d18, d2,  d6
 
130
+        vext.32         q13, q12, q12, #1
 
131
+        vadd.f32        d2,  d2,  d6
 
132
+        vsub.f32        d16, d0,  d7
 
133
+        vadd.f32        d5,  d25, d24
 
134
+        vsub.f32        d4,  d26, d27
 
135
+        vadd.f32        d0,  d0,  d7
 
136
+        vsub.f32        d17, d1,  d5
 
137
+        vsub.f32        d19, d3,  d4
 
138
+        vadd.f32        d3,  d3,  d4
 
139
+        vadd.f32        d1,  d1,  d5
 
140
+
 
141
+        vst1.32         {d16-d19}, [r1,:128]
 
142
+        vst1.32         {d0-d3},   [r0,:128]
 
143
+
 
144
+        bx              lr
 
145
+.endfunc
 
146
+
 
147
+function fft16_neon
 
148
+        movrel          r1, mppm
 
149
+        vld1.32         {d16-d19}, [r0,:128]!   @ q8{r0,i0,r1,i1} q9{r2,i2,r3,i3}
 
150
+        pld             [r0, #32]
 
151
+        vld1.32         {d2-d3}, [r1,:128]
 
152
+        vext.32         q13, q9,  q9,  #1
 
153
+        vld1.32         {d22-d25}, [r0,:128]!   @ q11{r4,i4,r5,i5} q12{r6,i5,r7,i7}
 
154
+        vadd.f32        d4,  d16, d17
 
155
+        vsub.f32        d5,  d16, d17
 
156
+        vadd.f32        d18, d18, d19
 
157
+        vsub.f32        d19, d26, d27
 
158
+
 
159
+        vadd.f32        d20, d22, d23
 
160
+        vsub.f32        d22, d22, d23
 
161
+        vsub.f32        d23, d24, d25
 
162
+        vadd.f32        q8,  q2,  q9            @ {r0,i0,r1,i1}
 
163
+        vadd.f32        d21, d24, d25
 
164
+        vmul.f32        d24, d22, d2
 
165
+        vsub.f32        q9,  q2,  q9            @ {r2,i2,r3,i3}
 
166
+        vmul.f32        d25, d23, d3
 
167
+        vuzp.32         d16, d17                @ {r0,r1,i0,i1}
 
168
+        vmul.f32        q1,  q11, d2[1]
 
169
+        vuzp.32         d18, d19                @ {r2,r3,i2,i3}
 
170
+        vrev64.32       q12, q12
 
171
+        vadd.f32        q11, q12, q1            @ {t1a,t2a,t5,t6}
 
172
+        vld1.32         {d24-d27}, [r0,:128]!   @ q12{r8,i8,r9,i9} q13{r10,i10,r11,i11}
 
173
+        vzip.32         q10, q11
 
174
+        vld1.32         {d28-d31}, [r0,:128]    @ q14{r12,i12,r13,i13} q15{r14,i14,r15,i15}
 
175
+        vadd.f32        d0,  d22, d20
 
176
+        vadd.f32        d1,  d21, d23
 
177
+        vsub.f32        d2,  d21, d23
 
178
+        vsub.f32        d3,  d22, d20
 
179
+        sub             r0,  r0,  #96
 
180
+        vext.32         q13, q13, q13, #1
 
181
+        vsub.f32        q10, q8,  q0            @ {r4,r5,i4,i5}
 
182
+        vadd.f32        q8,  q8,  q0            @ {r0,r1,i0,i1}
 
183
+        vext.32         q15, q15, q15, #1
 
184
+        vsub.f32        q11, q9,  q1            @ {r6,r7,i6,i7}
 
185
+        vswp            d25, d26                @ q12{r8,i8,i10,r11} q13{r9,i9,i11,r10}
 
186
+        vadd.f32        q9,  q9,  q1            @ {r2,r3,i2,i3}
 
187
+        vswp            d29, d30                @ q14{r12,i12,i14,r15} q15{r13,i13,i15,r14}
 
188
+        vadd.f32        q0,  q12, q13           @ {t1,t2,t5,t6}
 
189
+        vadd.f32        q1,  q14, q15           @ {t1a,t2a,t5a,t6a}
 
190
+        movrel          r2,  ff_cos_16
 
191
+        vsub.f32        q13, q12, q13           @ {t3,t4,t7,t8}
 
192
+        vrev64.32       d1,  d1
 
193
+        vsub.f32        q15, q14, q15           @ {t3a,t4a,t7a,t8a}
 
194
+        vrev64.32       d3,  d3
 
195
+        movrel          r3,  pmmp
 
196
+        vswp            d1,  d26                @ q0{t1,t2,t3,t4} q13{t6,t5,t7,t8}
 
197
+        vswp            d3,  d30                @ q1{t1a,t2a,t3a,t4a} q15{t6a,t5a,t7a,t8a}
 
198
+        vadd.f32        q12, q0,  q13           @ {r8,i8,r9,i9}
 
199
+        vadd.f32        q14, q1,  q15           @ {r12,i12,r13,i13}
 
200
+        vld1.32         {d4-d5},  [r2,:64]
 
201
+        vsub.f32        q13, q0,  q13           @ {r10,i10,r11,i11}
 
202
+        vsub.f32        q15, q1,  q15           @ {r14,i14,r15,i15}
 
203
+        vswp            d25, d28                @ q12{r8,i8,r12,i12} q14{r9,i9,r13,i13}
 
204
+        vld1.32         {d6-d7},  [r3,:128]
 
205
+        vrev64.32       q1,  q14
 
206
+        vmul.f32        q14, q14, d4[1]
 
207
+        vmul.f32        q1,  q1,  q3
 
208
+        vmla.f32        q14, q1,  d5[1]         @ {t1a,t2a,t5a,t6a}
 
209
+        vswp            d27, d30                @ q13{r10,i10,r14,i14} q15{r11,i11,r15,i15}
 
210
+        vzip.32         q12, q14
 
211
+        vadd.f32        d0,  d28, d24
 
212
+        vadd.f32        d1,  d25, d29
 
213
+        vsub.f32        d2,  d25, d29
 
214
+        vsub.f32        d3,  d28, d24
 
215
+        vsub.f32        q12, q8,  q0            @ {r8,r9,i8,i9}
 
216
+        vadd.f32        q8,  q8,  q0            @ {r0,r1,i0,i1}
 
217
+        vsub.f32        q14, q10, q1            @ {r12,r13,i12,i13}
 
218
+        mov             r1,  #32
 
219
+        vadd.f32        q10, q10, q1            @ {r4,r5,i4,i5}
 
220
+        vrev64.32       q0,  q13
 
221
+        vmul.f32        q13, q13, d5[0]
 
222
+        vrev64.32       q1,  q15
 
223
+        vmul.f32        q15, q15, d5[1]
 
224
+        vst2.32         {d16-d17},[r0,:128], r1
 
225
+        vmul.f32        q0,  q0,  q3
 
226
+        vst2.32         {d20-d21},[r0,:128], r1
 
227
+        vmul.f32        q1,  q1,  q3
 
228
+        vmla.f32        q13, q0,  d5[0]         @ {t1,t2,t5,t6}
 
229
+        vmla.f32        q15, q1,  d4[1]         @ {t1a,t2a,t5a,t6a}
 
230
+        vst2.32         {d24-d25},[r0,:128], r1
 
231
+        vst2.32         {d28-d29},[r0,:128]
 
232
+        vzip.32         q13, q15
 
233
+        sub             r0, r0, #80
 
234
+        vadd.f32        d0,  d30, d26
 
235
+        vadd.f32        d1,  d27, d31
 
236
+        vsub.f32        d2,  d27, d31
 
237
+        vsub.f32        d3,  d30, d26
 
238
+        vsub.f32        q13, q9,  q0            @ {r10,r11,i10,i11}
 
239
+        vadd.f32        q9,  q9,  q0            @ {r2,r3,i2,i3}
 
240
+        vsub.f32        q15, q11, q1            @ {r14,r15,i14,i15}
 
241
+        vadd.f32        q11, q11, q1            @ {r6,r7,i6,i7}
 
242
+        vst2.32         {d18-d19},[r0,:128], r1
 
243
+        vst2.32         {d22-d23},[r0,:128], r1
 
244
+        vst2.32         {d26-d27},[r0,:128], r1
 
245
+        vst2.32         {d30-d31},[r0,:128]
 
246
+        bx              lr
 
247
+.endfunc
 
248
+
 
249
+function fft_pass_neon
 
250
+        push            {r4-r6,lr}
 
251
+        mov             r6,  r2                 @ n
 
252
+        lsl             r5,  r2,  #3            @ 2 * n * sizeof FFTSample
 
253
+        lsl             r4,  r2,  #4            @ 2 * n * sizeof FFTComplex
 
254
+        lsl             r2,  r2,  #5            @ 4 * n * sizeof FFTComplex
 
255
+        add             r3,  r2,  r4
 
256
+        add             r4,  r4,  r0            @ &z[o1]
 
257
+        add             r2,  r2,  r0            @ &z[o2]
 
258
+        add             r3,  r3,  r0            @ &z[o3]
 
259
+        vld1.32         {d20-d21},[r2,:128]     @ {z[o2],z[o2+1]}
 
260
+        movrel          r12, pmmp
 
261
+        vld1.32         {d22-d23},[r3,:128]     @ {z[o3],z[o3+1]}
 
262
+        add             r5,  r5,  r1            @ wim
 
263
+        vld1.32         {d6-d7},  [r12,:128]    @ pmmp
 
264
+        vswp            d21, d22
 
265
+        vld1.32         {d4},     [r1,:64]!     @ {wre[0],wre[1]}
 
266
+        sub             r5,  r5,  #4            @ wim--
 
267
+        vrev64.32       q1,  q11
 
268
+        vmul.f32        q11, q11, d4[1]
 
269
+        vmul.f32        q1,  q1,  q3
 
270
+        vld1.32         {d5[0]},  [r5,:32]      @ d5[0] = wim[-1]
 
271
+        vmla.f32        q11, q1,  d5[0]         @ {t1a,t2a,t5a,t6a}
 
272
+        vld2.32         {d16-d17},[r0,:128]     @ {z[0],z[1]}
 
273
+        sub             r6, r6, #1              @ n--
 
274
+        vld2.32         {d18-d19},[r4,:128]     @ {z[o1],z[o1+1]}
 
275
+        vzip.32         q10, q11
 
276
+        vadd.f32        d0,  d22, d20
 
277
+        vadd.f32        d1,  d21, d23
 
278
+        vsub.f32        d2,  d21, d23
 
279
+        vsub.f32        d3,  d22, d20
 
280
+        vsub.f32        q10, q8,  q0
 
281
+        vadd.f32        q8,  q8,  q0
 
282
+        vsub.f32        q11, q9,  q1
 
283
+        vadd.f32        q9,  q9,  q1
 
284
+        vst2.32         {d20-d21},[r2,:128]!    @ {z[o2],z[o2+1]}
 
285
+        vst2.32         {d16-d17},[r0,:128]!    @ {z[0],z[1]}
 
286
+        vst2.32         {d22-d23},[r3,:128]!    @ {z[o3],z[o3+1]}
 
287
+        vst2.32         {d18-d19},[r4,:128]!    @ {z[o1],z[o1+1]}
 
288
+        sub             r5,  r5,  #8            @ wim -= 2
 
289
+1:
 
290
+        vld1.32         {d20-d21},[r2,:128]     @ {z[o2],z[o2+1]}
 
291
+        vld1.32         {d22-d23},[r3,:128]     @ {z[o3],z[o3+1]}
 
292
+        vswp            d21, d22
 
293
+        vld1.32         {d4}, [r1]!             @ {wre[0],wre[1]}
 
294
+        vrev64.32       q0,  q10
 
295
+        vmul.f32        q10, q10, d4[0]
 
296
+        vrev64.32       q1,  q11
 
297
+        vmul.f32        q11, q11, d4[1]
 
298
+        vld1.32         {d5}, [r5]              @ {wim[-1],wim[0]}
 
299
+        vmul.f32        q0,  q0,  q3
 
300
+        sub             r5,  r5,  #8            @ wim -= 2
 
301
+        vmul.f32        q1,  q1,  q3
 
302
+        vmla.f32        q10, q0,  d5[1]         @ {t1,t2,t5,t6}
 
303
+        vmla.f32        q11, q1,  d5[0]         @ {t1a,t2a,t5a,t6a}
 
304
+        vld2.32         {d16-d17},[r0,:128]     @ {z[0],z[1]}
 
305
+        subs            r6,  r6,  #1            @ n--
 
306
+        vld2.32         {d18-d19},[r4,:128]     @ {z[o1],z[o1+1]}
 
307
+        vzip.32         q10, q11
 
308
+        vadd.f32        d0,  d22, d20
 
309
+        vadd.f32        d1,  d21, d23
 
310
+        vsub.f32        d2,  d21, d23
 
311
+        vsub.f32        d3,  d22, d20
 
312
+        vsub.f32        q10, q8,  q0
 
313
+        vadd.f32        q8,  q8,  q0
 
314
+        vsub.f32        q11, q9,  q1
 
315
+        vadd.f32        q9,  q9,  q1
 
316
+        vst2.32         {d20-d21}, [r2,:128]!   @ {z[o2],z[o2+1]}
 
317
+        vst2.32         {d16-d17}, [r0,:128]!   @ {z[0],z[1]}
 
318
+        vst2.32         {d22-d23}, [r3,:128]!   @ {z[o3],z[o3+1]}
 
319
+        vst2.32         {d18-d19}, [r4,:128]!   @ {z[o1],z[o1+1]}
 
320
+        bne             1b
 
321
+
 
322
+        pop             {r4-r6,pc}
 
323
+.endfunc
 
324
+
 
325
+.macro  def_fft n, n2, n4
 
326
+        .align 6
 
327
+function fft\n\()_neon
 
328
+        push            {r4, lr}
 
329
+        mov             r4,  r0
 
330
+        bl              fft\n2\()_neon
 
331
+        add             r0,  r4,  #\n4*2*8
 
332
+        bl              fft\n4\()_neon
 
333
+        add             r0,  r4,  #\n4*3*8
 
334
+        bl              fft\n4\()_neon
 
335
+        mov             r0,  r4
 
336
+        pop             {r4, lr}
 
337
+        movrel          r1,  ff_cos_\n
 
338
+        mov             r2,  #\n4/2
 
339
+        b               fft_pass_neon
 
340
+.endfunc
 
341
+.endm
 
342
+
 
343
+        def_fft    32,    16,     8
 
344
+        def_fft    64,    32,    16
 
345
+        def_fft   128,    64,    32
 
346
+        def_fft   256,   128,    64
 
347
+        def_fft   512,   256,   128
 
348
+        def_fft  1024,   512,   256
 
349
+        def_fft  2048,  1024,   512
 
350
+        def_fft  4096,  2048,  1024
 
351
+        def_fft  8192,  4096,  2048
 
352
+        def_fft 16384,  8192,  4096
 
353
+        def_fft 32768, 16384,  8192
 
354
+        def_fft 65536, 32768, 16384
 
355
+
 
356
+function ff_fft_calc_neon, export=1
 
357
+        ldr             r2,  [r0]
 
358
+        sub             r2,  r2,  #2
 
359
+        movrel          r3,  fft_tab_neon
 
360
+        ldr             r3,  [r3, r2, lsl #2]
 
361
+        mov             r0,  r1
 
362
+        bx              r3
 
363
+.endfunc
 
364
+
 
365
+function ff_fft_permute_neon, export=1
 
366
+        push            {r4,lr}
 
367
+        mov             r12, #1
 
368
+        ldr             r2,  [r0]       @ nbits
 
369
+        ldr             r3,  [r0, #20]  @ tmp_buf
 
370
+        ldr             r0,  [r0, #8]   @ revtab
 
371
+        lsl             r12, r12, r2
 
372
+        mov             r2,  r12
 
373
+1:
 
374
+        vld1.32         {d0-d1}, [r1,:128]!
 
375
+        ldr             r4,  [r0], #4
 
376
+        uxtah           lr,  r3,  r4
 
377
+        uxtah           r4,  r3,  r4,  ror #16
 
378
+        vst1.32         {d0}, [lr,:64]
 
379
+        vst1.32         {d1}, [r4,:64]
 
380
+        subs            r12, r12, #2
 
381
+        bgt             1b
 
382
+
 
383
+        sub             r1,  r1,  r2,  lsl #3
 
384
+1:
 
385
+        vld1.32         {d0-d3}, [r3,:128]!
 
386
+        vst1.32         {d0-d3}, [r1,:128]!
 
387
+        subs            r2,  r2,  #4
 
388
+        bgt             1b
 
389
+
 
390
+        pop             {r4,pc}
 
391
+.endfunc
 
392
+
 
393
+        .section .rodata
 
394
+        .align 4
 
395
+fft_tab_neon:
 
396
+        .word fft4_neon
 
397
+        .word fft8_neon
 
398
+        .word fft16_neon
 
399
+        .word fft32_neon
 
400
+        .word fft64_neon
 
401
+        .word fft128_neon
 
402
+        .word fft256_neon
 
403
+        .word fft512_neon
 
404
+        .word fft1024_neon
 
405
+        .word fft2048_neon
 
406
+        .word fft4096_neon
 
407
+        .word fft8192_neon
 
408
+        .word fft16384_neon
 
409
+        .word fft32768_neon
 
410
+        .word fft65536_neon
 
411
+        .size fft_tab_neon, . - fft_tab_neon
 
412
+
 
413
+        .align 4
 
414
+pmmp:   .float  +1.0, -1.0, -1.0, +1.0
 
415
+mppm:   .float  -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
 
416
diff --git a/libavcodec/arm/mdct_neon.S b/libavcodec/arm/mdct_neon.S
 
417
new file mode 100644
 
418
index 0000000..6d1dcfd
 
419
--- /dev/null
 
420
+++ b/libavcodec/arm/mdct_neon.S
 
421
@@ -0,0 +1,178 @@
 
422
+/*
 
423
+ * ARM NEON optimised MDCT
 
424
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 
425
+ *
 
426
+ * This file is part of FFmpeg.
 
427
+ *
 
428
+ * FFmpeg is free software; you can redistribute it and/or
 
429
+ * modify it under the terms of the GNU Lesser General Public
 
430
+ * License as published by the Free Software Foundation; either
 
431
+ * version 2.1 of the License, or (at your option) any later version.
 
432
+ *
 
433
+ * FFmpeg is distributed in the hope that it will be useful,
 
434
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
 
435
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 
436
+ * Lesser General Public License for more details.
 
437
+ *
 
438
+ * You should have received a copy of the GNU Lesser General Public
 
439
+ * License along with FFmpeg; if not, write to the Free Software
 
440
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 
441
+ */
 
442
+
 
443
+#include "asm.S"
 
444
+
 
445
+        .fpu neon
 
446
+        .text
 
447
+
 
448
+function ff_imdct_half_neon, export=1
 
449
+        push            {r4-r8,lr}
 
450
+
 
451
+        mov             r12, #1
 
452
+        ldr             lr,  [r0, #4]           @ nbits
 
453
+        ldr             r4,  [r0, #8]           @ tcos
 
454
+        ldr             r5,  [r0, #12]          @ tsin
 
455
+        ldr             r3,  [r0, #24]          @ revtab
 
456
+        lsl             r12, r12, lr            @ n  = 1 << nbits
 
457
+        lsr             lr,  r12, #2            @ n4 = n >> 2
 
458
+        add             r7,  r2,  r12,  lsl #1
 
459
+        mov             r12,  #-16
 
460
+        sub             r7,  r7,  #16
 
461
+
 
462
+        vld1.32         {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0
 
463
+        vld1.32         {d0-d1},  [r2,:128]!    @ d0 =m0,x d1 =m1,x
 
464
+        vld1.32         {d2},     [r4,:64]!     @ d2=c0,c1
 
465
+        vld1.32         {d3},     [r5,:64]!     @ d3=s0,s1
 
466
+        vuzp.32         d17, d16
 
467
+        vuzp.32         d0,  d1
 
468
+        vmul.f32        d6,  d16, d2
 
469
+        vmul.f32        d7,  d0,  d2
 
470
+1:
 
471
+        subs            lr,  lr,  #2
 
472
+        ldr             r6,  [r3], #4
 
473
+        vmul.f32        d4,  d0,  d3
 
474
+        vmul.f32        d5,  d16, d3
 
475
+        vsub.f32        d4,  d6,  d4
 
476
+        vadd.f32        d5,  d5,  d7
 
477
+        uxtah           r8,  r1,  r6,  ror #16
 
478
+        uxtah           r6,  r1,  r6
 
479
+        beq             1f
 
480
+        vld1.32         {d16-d17},[r7,:128],r12
 
481
+        vld1.32         {d0-d1},  [r2,:128]!
 
482
+        vuzp.32         d17, d16
 
483
+        vld1.32         {d2},     [r4,:64]!
 
484
+        vuzp.32         d0,  d1
 
485
+        vmul.f32        d6,  d16, d2
 
486
+        vld1.32         {d3},     [r5,:64]!
 
487
+        vmul.f32        d7,  d0,  d2
 
488
+        vst2.32         {d4[0],d5[0]}, [r6,:64]
 
489
+        vst2.32         {d4[1],d5[1]}, [r8,:64]
 
490
+        b               1b
 
491
+1:
 
492
+        vst2.32         {d4[0],d5[0]}, [r6,:64]
 
493
+        vst2.32         {d4[1],d5[1]}, [r8,:64]
 
494
+
 
495
+        mov             r4,  r0
 
496
+        mov             r6,  r1
 
497
+        add             r0,  r0,  #16
 
498
+        bl              ff_fft_calc_neon
 
499
+
 
500
+        mov             r12, #1
 
501
+        ldr             lr,  [r4, #4]           @ nbits
 
502
+        ldr             r5,  [r4, #12]          @ tsin
 
503
+        ldr             r4,  [r4, #8]           @ tcos
 
504
+        lsl             r12, r12, lr            @ n  = 1 << nbits
 
505
+        lsr             lr,  r12, #3            @ n8 = n >> 3
 
506
+
 
507
+        add             r4,  r4,  lr,  lsl #2
 
508
+        add             r5,  r5,  lr,  lsl #2
 
509
+        add             r6,  r6,  lr,  lsl #3
 
510
+        sub             r1,  r4,  #8
 
511
+        sub             r2,  r5,  #8
 
512
+        sub             r3,  r6,  #16
 
513
+
 
514
+        mov             r7,  #-16
 
515
+        mov             r12, #-8
 
516
+        mov             r8,  r6
 
517
+        mov             r0,  r3
 
518
+
 
519
+        vld1.32         {d0-d1},  [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0
 
520
+        vld1.32         {d20-d21},[r6,:128]!    @ d20=i2,r2 d21=i3,r3
 
521
+        vld1.32         {d18},    [r2,:64], r12 @ d18=s1,s0
 
522
+        vuzp.32         d20, d21
 
523
+        vuzp.32         d0,  d1
 
524
+1:
 
525
+        subs            lr,  lr,  #2
 
526
+        vmul.f32        d7,  d0,  d18
 
527
+        vld1.32         {d19},    [r5,:64]!     @ d19=s2,s3
 
528
+        vmul.f32        d4,  d1,  d18
 
529
+        vld1.32         {d16},    [r1,:64], r12 @ d16=c1,c0
 
530
+        vmul.f32        d5,  d21, d19
 
531
+        vld1.32         {d17},    [r4,:64]!     @ d17=c2,c3
 
532
+        vmul.f32        d6,  d20, d19
 
533
+        vmul.f32        d22, d1,  d16
 
534
+        vmul.f32        d23, d21, d17
 
535
+        vmul.f32        d24, d0,  d16
 
536
+        vmul.f32        d25, d20, d17
 
537
+        vadd.f32        d7,  d7,  d22
 
538
+        vadd.f32        d6,  d6,  d23
 
539
+        vsub.f32        d4,  d4,  d24
 
540
+        vsub.f32        d5,  d5,  d25
 
541
+        beq             1f
 
542
+        vld1.32         {d0-d1},  [r3,:128], r7
 
543
+        vld1.32         {d20-d21},[r6,:128]!
 
544
+        vld1.32         {d18},    [r2,:64], r12
 
545
+        vuzp.32         d20, d21
 
546
+        vuzp.32         d0,  d1
 
547
+        vrev64.32       q3,  q3
 
548
+        vtrn.32         d4,  d6
 
549
+        vtrn.32         d5,  d7
 
550
+        vswp            d5,  d6
 
551
+        vst1.32         {d4-d5},  [r0,:128], r7
 
552
+        vst1.32         {d6-d7},  [r8,:128]!
 
553
+        b               1b
 
554
+1:
 
555
+        vrev64.32       q3,  q3
 
556
+        vtrn.32         d4,  d6
 
557
+        vtrn.32         d5,  d7
 
558
+        vswp            d5,  d6
 
559
+        vst1.32         {d4-d5},  [r0,:128]
 
560
+        vst1.32         {d6-d7},  [r8,:128]
 
561
+
 
562
+        pop             {r4-r8,pc}
 
563
+.endfunc
 
564
+
 
565
+function ff_imdct_calc_neon, export=1
 
566
+        push            {r4-r6,lr}
 
567
+
 
568
+        ldr             r3,  [r0, #4]
 
569
+        mov             r4,  #1
 
570
+        mov             r5,  r1
 
571
+        lsl             r4,  r4,  r3
 
572
+        add             r1,  r1,  r4
 
573
+
 
574
+        bl              ff_imdct_half_neon
 
575
+
 
576
+        add             r0,  r5,  r4,  lsl #2
 
577
+        add             r1,  r5,  r4,  lsl #1
 
578
+        sub             r0,  r0,  #8
 
579
+        sub             r2,  r1,  #16
 
580
+        mov             r3,  #-16
 
581
+        mov             r6,  #-8
 
582
+        vmov.i32        d30, #1<<31
 
583
+1:
 
584
+        vld1.32         {d0-d1},  [r2,:128], r3
 
585
+        pld             [r0, #-16]
 
586
+        vrev64.32       q0,  q0
 
587
+        vld1.32         {d2-d3},  [r1,:128]!
 
588
+        veor            d4,  d1,  d30
 
589
+        pld             [r2, #-16]
 
590
+        vrev64.32       q1,  q1
 
591
+        veor            d5,  d0,  d30
 
592
+        vst1.32         {d2},     [r0,:64], r6
 
593
+        vst1.32         {d3},     [r0,:64], r6
 
594
+        vst1.32         {d4-d5},  [r5,:128]!
 
595
+        subs            r4,  r4,  #16
 
596
+        bgt             1b
 
597
+
 
598
+        pop             {r4-r6,pc}
 
599
+.endfunc
 
600
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
 
601
index e9c2bfc..c4abd14 100644
 
602
--- a/libavcodec/dsputil.h
 
603
+++ b/libavcodec/dsputil.h
 
604
@@ -692,11 +692,13 @@ extern FFTSample* ff_cos_tabs[13];
 
605
 int ff_fft_init(FFTContext *s, int nbits, int inverse);
 
606
 void ff_fft_permute_c(FFTContext *s, FFTComplex *z);
 
607
 void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
 
608
+void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
 
609
 void ff_fft_calc_c(FFTContext *s, FFTComplex *z);
 
610
 void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
 
611
 void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z);
 
612
 void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z);
 
613
 void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z);
 
614
+void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
 
615
 
 
616
 /**
 
617
  * Do the permutation needed BEFORE calling ff_fft_calc().
 
618
@@ -766,6 +768,8 @@ void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *inpu
 
619
 void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input);
 
620
 void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output, const FFTSample *input);
 
621
 void ff_imdct_half_sse(MDCTContext *s, FFTSample *output, const FFTSample *input);
 
622
+void ff_imdct_calc_neon(MDCTContext *s, FFTSample *output, const FFTSample *input);
 
623
+void ff_imdct_half_neon(MDCTContext *s, FFTSample *output, const FFTSample *input);
 
624
 void ff_mdct_calc(MDCTContext *s, FFTSample *out, const FFTSample *input);
 
625
 void ff_mdct_end(MDCTContext *s);
 
626
 
 
627
diff --git a/libavcodec/fft.c b/libavcodec/fft.c
 
628
index 296f634..b05ddda 100644
 
629
--- a/libavcodec/fft.c
 
630
+++ b/libavcodec/fft.c
 
631
@@ -64,6 +64,7 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
 
632
     float alpha, c1, s1, s2;
 
633
     int split_radix = 1;
 
634
     int av_unused has_vectors;
 
635
+    int revtab_shift = 0;
 
636
 
 
637
     if (nbits < 2 || nbits > 16)
 
638
         goto fail;
 
639
@@ -112,6 +113,12 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
 
640
         s->fft_calc = ff_fft_calc_altivec;
 
641
         split_radix = 0;
 
642
     }
 
643
+#elif HAVE_NEON
 
644
+    s->fft_permute = ff_fft_permute_neon;
 
645
+    s->fft_calc    = ff_fft_calc_neon;
 
646
+    s->imdct_calc  = ff_imdct_calc_neon;
 
647
+    s->imdct_half  = ff_imdct_half_neon;
 
648
+    revtab_shift = 3;
 
649
 #endif
 
650
 
 
651
     if (split_radix) {
 
652
@@ -125,7 +132,8 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
 
653
                 tab[m/2-i] = tab[i];
 
654
         }
 
655
         for(i=0; i<n; i++)
 
656
-            s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = i;
 
657
+            s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] =
 
658
+                i << revtab_shift;
 
659
         s->tmp_buf = av_malloc(n * sizeof(FFTComplex));
 
660
     } else {
 
661
         int np, nblocks, np2, l;
 
662
-- 
 
663
1.6.3.3
 
664