2
* ARM NEON optimised MDCT
3
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
5
* This file is part of FFmpeg.
7
* FFmpeg is free software; you can redistribute it and/or
8
* modify it under the terms of the GNU Lesser General Public
9
* License as published by the Free Software Foundation; either
10
* version 2.1 of the License, or (at your option) any later version.
12
* FFmpeg is distributed in the hope that it will be useful,
13
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
* Lesser General Public License for more details.
17
* You should have received a copy of the GNU Lesser General Public
18
* License along with FFmpeg; if not, write to the Free Software
19
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
28
#define ff_fft_calc_neon X(ff_fft_calc_neon)
30
@-----------------------------------------------------------------------
@ ff_imdct_half_neon — half-size inverse MDCT (NEON).
@ In:   r0 = MDCT context pointer; visible field offsets:
@            +8  revtab, +28 mdct_bits, +32 tcos
@       r1 = presumably the output buffer (store addresses r6/r8 are
@            derived from it) — TODO confirm against the full source
@       r2 = presumably the input buffer (pre-rotation loads read
@            through it) — TODO confirm
@ NOTE(review): this extraction is incomplete — the interleaved
@ original line numbers jump (37→43, 46→58, 72→79, …), so the
@ push/pop, loop labels, conditional branches and the FFT call
@ (ff_fft_calc_neon, aliased at the top of the file) are missing.
@ Do not assemble as-is; restore the dropped lines first.
@-----------------------------------------------------------------------
function ff_imdct_half_neon, export=1
34
@ --- parameter fetch -------------------------------------------------
ldr lr, [r0, #28] @ mdct_bits
35
ldr r4, [r0, #32] @ tcos
36
ldr r3, [r0, #8] @ revtab
37
@ r12 is shifted before any visible initialisation — the original
@ presumably sets r12 = 1 on a dropped line; verify when restoring.
lsl r12, r12, lr @ n = 1 << nbits
38
lsr lr, r12, #2 @ n4 = n >> 2
39
@ r7 = r2 + 2*n bytes — pointer into the far half of the input,
@ walked with post-increment r12 below (exact role: confirm).
add r7, r2, r12, lsl #1
40
@ --- pre-rotation: load input pairs and cos/sin twiddles -------------
43
vld2.32 {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0
44
vld2.32 {d0-d1}, [r2,:128]! @ d0 =m0,x d1 =m1,x
46
vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s2
58
@ r8/r6 appear to hold revtab-derived indices (loaded on dropped
@ lines); scale by 8 (complex float) into output addresses.
add r8, r1, r8, lsl #3
59
add r6, r1, r6, lsl #3
61
@ next loop iteration's loads (software-pipelined with the stores)
vld2.32 {d16-d17},[r7,:128],r12
62
vld2.32 {d0-d1}, [r2,:128]!
64
vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s2
67
@ scatter pre-rotated pairs (d4/d5, computed on dropped lines) to
@ bit-reversed positions for the in-place FFT that follows.
vst2.32 {d4[0],d5[0]}, [r6,:64]
68
vst2.32 {d4[1],d5[1]}, [r8,:64]
71
@ loop-tail copy of the scatter stores (loop branch not visible)
vst2.32 {d4[0],d5[0]}, [r6,:64]
72
vst2.32 {d4[1],d5[1]}, [r8,:64]
79
@ --- post-rotation setup ---------------------------------------------
@ Here r4 holds the context pointer (register reuse after the FFT
@ call, which is on dropped lines); re-fetch bits and tcos.
ldr lr, [r4, #28] @ mdct_bits
80
ldr r4, [r4, #32] @ tcos
81
lsl r12, r12, lr @ n = 1 << nbits
82
lsr lr, r12, #3 @ n8 = n >> 3
84
@ advance tcos (r4) and the data pointer (r6) to the midpoint
add r4, r4, lr, lsl #3
85
add r6, r6, lr, lsl #3
93
@ --- post-rotation: complex multiply by twiddles ---------------------
vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0
94
vld2.32 {d20-d21},[r6,:128]! @ d20=i2,r2 d21=i3,r3
95
vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
99
vld2.32 {d17,d19},[r4,:128]! @ d17=c2,c3 d19=s2,s3
101
@ partial products; the combining vadd/vsub steps are on dropped lines
vmul.f32 d5, d21, d19 @ i3*s3,i2*s2 (by the load comments above)
102
vmul.f32 d6, d20, d19 @ r3*s3,r2*s2
103
vmul.f32 d22, d1, d16 @ i0*c0,r0*c1 — confirm lane pairing
104
vmul.f32 d23, d21, d17 @ i3*c3,i2*c2
105
vmul.f32 d24, d0, d16
106
vmul.f32 d25, d20, d17
112
@ pipelined loads for the next post-rotation iteration
vld2.32 {d0-d1}, [r3,:128], r7
113
vld2.32 {d20-d21},[r6,:128]!
114
vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
116
@ store rotated results: r0 walks forward (stride r7), r8 increments
vst2.32 {d4,d6}, [r0,:128], r7
117
vst2.32 {d5,d7}, [r8,:128]!
121
@ final (epilogue) stores; the function's pop/return is not visible
vst2.32 {d4,d6}, [r0,:128]
122
vst2.32 {d5,d7}, [r8,:128]
127
@-----------------------------------------------------------------------
@ ff_imdct_calc_neon — full inverse MDCT (NEON).
@ Computes the half transform via ff_imdct_half_neon, then expands it
@ to the full output window.
@ NOTE(review): extraction gaps here too (original line numbers jump
@ 136→138, 139→146, 149→154): the prologue/epilogue, the loop label
@ and branch, and the instructions that produce d4/d5 (presumably a
@ negated/reversed mirror of d0/d1) are missing from this view.
@-----------------------------------------------------------------------
function ff_imdct_calc_neon, export=1
136
@ half-size IMDCT into the buffer; r5/r4 are presumably saved copies
@ of the output pointer and n4 (set up on dropped lines) — confirm
bl ff_imdct_half_neon
138
@ r0 = r5 + 4*r4, r1 = r5 + 2*r4 — pointers into the two regions of
@ the output that the mirror loop below writes/reads (units: floats
@ vs bytes not determinable from here)
add r0, r5, r4, lsl #2
139
add r1, r5, r4, lsl #1
146
@ mirror loop body: read half-transform values ...
vld1.32 {d0-d1}, [r2,:128], r3
149
vld1.32 {d2-d3}, [r1,:128]!
154
@ ... and fan them out; r6 is presumably a (negative) stride so r0
@ walks backwards — TODO confirm
vst1.32 {d2}, [r0,:64], r6
155
vst1.32 {d3}, [r0,:64], r6
156
@ d4/d5 are computed on dropped lines (never written in this view)
vst1.32 {d4-d5}, [r5,:128]!
163
function ff_mdct_calc_neon, export=1
167
ldr lr, [r0, #28] @ mdct_bits
168
ldr r4, [r0, #32] @ tcos
169
ldr r3, [r0, #8] @ revtab
170
lsl lr, r12, lr @ n = 1 << nbits
171
add r7, r2, lr @ in4u
172
sub r9, r7, #16 @ in4d
173
add r2, r7, lr, lsl #1 @ in3u
174
add r8, r9, lr, lsl #1 @ in3d
175
add r5, r4, lr, lsl #1
180
vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
181
vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
182
vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0
183
vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1
184
vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0
185
vsub.f32 d0, d18, d0 @ in4d-in4u I
186
vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1
187
vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1
188
vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
189
vadd.f32 d1, d1, d19 @ in3u+in3d -R
190
vsub.f32 d16, d16, d2 @ in0u-in2d R
191
vadd.f32 d17, d17, d3 @ in2u+in1d -I
193
vmul.f32 d7, d0, d21 @ I*s
194
ldr r10, [r3, lr, lsr #1]
195
vmul.f32 d6, d1, d20 @ -R*c
197
vmul.f32 d4, d1, d21 @ -R*s
198
vmul.f32 d5, d0, d20 @ I*c
199
vmul.f32 d24, d16, d30 @ R*c
200
vmul.f32 d25, d17, d31 @ -I*s
201
vmul.f32 d22, d16, d31 @ R*s
202
vmul.f32 d23, d17, d30 @ I*c
204
vsub.f32 d6, d6, d7 @ -R*c-I*s
205
vadd.f32 d7, d4, d5 @ -R*s+I*c
206
vsub.f32 d24, d25, d24 @ I*s-R*c
207
vadd.f32 d25, d22, d23 @ R*s-I*c
210
vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
211
vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
212
vneg.f32 d7, d7 @ R*s-I*c
213
vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0
214
vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1
215
vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0
216
vsub.f32 d0, d18, d0 @ in4d-in4u I
217
vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1
218
vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1
219
vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
220
vadd.f32 d1, d1, d19 @ in3u+in3d -R
221
vsub.f32 d16, d16, d2 @ in0u-in2d R
222
vadd.f32 d17, d17, d3 @ in2u+in1d -I
223
uxth r12, r6, ror #16
225
add r12, r1, r12, lsl #3
226
add r6, r1, r6, lsl #3
227
vst2.32 {d6[0],d7[0]}, [r6,:64]
228
vst2.32 {d6[1],d7[1]}, [r12,:64]
229
uxth r6, r10, ror #16
231
add r6 , r1, r6, lsl #3
232
add r10, r1, r10, lsl #3
233
vst2.32 {d24[0],d25[0]},[r10,:64]
234
vst2.32 {d24[1],d25[1]},[r6,:64]
237
vneg.f32 d7, d7 @ R*s-I*c
238
uxth r12, r6, ror #16
240
add r12, r1, r12, lsl #3
241
add r6, r1, r6, lsl #3
242
vst2.32 {d6[0],d7[0]}, [r6,:64]
243
vst2.32 {d6[1],d7[1]}, [r12,:64]
244
uxth r6, r10, ror #16
246
add r6 , r1, r6, lsl #3
247
add r10, r1, r10, lsl #3
248
vst2.32 {d24[0],d25[0]},[r10,:64]
249
vst2.32 {d24[1],d25[1]},[r6,:64]
256
ldr lr, [r4, #28] @ mdct_bits
257
ldr r4, [r4, #32] @ tcos
258
lsl r12, r12, lr @ n = 1 << nbits
259
lsr lr, r12, #3 @ n8 = n >> 3
261
add r4, r4, lr, lsl #3
262
add r6, r6, lr, lsl #3
270
vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =r1,i1 d1 =r0,i0
271
vld2.32 {d20-d21},[r6,:128]! @ d20=r2,i2 d21=r3,i3
272
vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
275
vmul.f32 d7, d0, d18 @ r1*s1,r0*s0
276
vld2.32 {d17,d19},[r4,:128]! @ c2,c3 s2,s3
277
vmul.f32 d4, d1, d18 @ i1*s1,i0*s0
278
vmul.f32 d5, d21, d19 @ i2*s2,i3*s3
279
vmul.f32 d6, d20, d19 @ r2*s2,r3*s3
280
vmul.f32 d24, d0, d16 @ r1*c1,r0*c0
281
vmul.f32 d25, d20, d17 @ r2*c2,r3*c3
282
vmul.f32 d22, d21, d17 @ i2*c2,i3*c3
283
vmul.f32 d23, d1, d16 @ i1*c1,i0*c0
284
vadd.f32 d4, d4, d24 @ i1*s1+r1*c1,i0*s0+r0*c0
285
vadd.f32 d5, d5, d25 @ i2*s2+r2*c2,i3*s3+r3*c3
286
vsub.f32 d6, d22, d6 @ i2*c2-r2*s2,i3*c3-r3*s3
287
vsub.f32 d7, d23, d7 @ i1*c1-r1*s1,i0*c0-r0*s0
290
vld2.32 {d0-d1}, [r3,:128], r7
291
vld2.32 {d20-d21},[r6,:128]!
292
vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
294
vst2.32 {d4,d6}, [r0,:128], r7
295
vst2.32 {d5,d7}, [r8,:128]!
299
vst2.32 {d4,d6}, [r0,:128]
300
vst2.32 {d5,d7}, [r8,:128]