/*
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
23
@ bflies: butterfly on signed 16-bit lanes built from halving add/subtract.
@   d0, d1 - in/out vectors
@   r0, r1 - scratch vectors, overwritten
@ Every add/subtract here is the halving form, so each result lane is the
@ sum or difference shifted right by one bit.
@ The lane labels (t1..t6) in the trailing comments are the original notes.
@ NOTE(review): the bare numeric lines and the absent .endm look like
@ extraction residue - confirm against the upstream file.
.macro bflies d0, d1, r0, r1
24
vrev64.32 \r0, \d1 @ t5, t6, t1, t2
25
vhsub.s16 \r1, \d1, \r0 @ t1-t5, t2-t6, t5-t1, t6-t2
26
vhadd.s16 \r0, \d1, \r0 @ t1+t5, t2+t6, t5+t1, t6+t2
27
vext.16 \r1, \r1, \r1, #1 @ t2-t6, t5-t1, t6-t2, t1-t5
28
vtrn.32 \r0, \r1 @ t1+t5, t2+t6, t2-t6, t5-t1
30
@ Final stage: d1 and d0 receive the halved difference and halved sum of
@ d0 and the shuffled vector in r0. d1 is written first, while d0 still
@ holds its input value.
vhsub.s16 \d1, \d0, \r0
31
vhadd.s16 \d0, \d0, \r0
34
@ transform01: multiply-accumulate d3 against two coefficient vectors,
@ narrow the result back to 16 bits, then run bflies on q0/q1.
@   q0, q1 - vectors handed to bflies as its in/out pair
@   d3     - signed 16-bit input lanes; replaced by the narrowed result
@   c0, c1 - signed 16-bit multiplier vectors
@   r0     - second multiplicand for the accumulate step
@   w0, w1 - 32-bit-lane accumulators, then reused as bflies scratch
@ NOTE(review): r0 is read below without a visible write, and no .endm is
@ visible; lines appear to be missing from this view - confirm upstream.
.macro transform01 q0, q1, d3, c0, c1, r0, w0, w1
36
vmull.s16 \w0, \d3, \c0 @ w0 = d3 * c0, widened to 32-bit lanes
37
vmlal.s16 \w0, \r0, \c1 @ w0 += r0 * c1
38
vshrn.s32 \d3, \w0, #15 @ d3 = w0 >> 15, narrowed to 16-bit lanes
39
bflies \q0, \q1, \w0, \w1
42
@ transform2: same multiply-accumulate-and-narrow step as transform01, done
@ for two inputs (d1 with c0/c1, d3 with c2/c3), followed by one bflies on
@ q0/q1.
@   d1, d3 - signed 16-bit inputs; each replaced by its narrowed result
@   q0, q1 - vectors handed to bflies as its in/out pair
@   c0..c3 - signed 16-bit multiplier vectors
@ The body also uses r0, r1, w0 and w1; they are presumably declared on the
@ continuation of the parameter list, which is not visible here.
@ d0 and d2 are not referenced by any visible line of the body.
@ NOTE(review): r0 and r1 are read without a visible write and no .endm is
@ visible; lines appear to be missing from this view - confirm upstream.
.macro transform2 d0, d1, d2, d3, q0, q1, c0, c1, c2, c3, \
46
vmull.s16 \w0, \d1, \c0 @ w0 = d1 * c0, widened to 32-bit lanes
47
vmlal.s16 \w0, \r0, \c1 @ w0 += r0 * c1
48
vmull.s16 \w1, \d3, \c2 @ w1 = d3 * c2, widened to 32-bit lanes
49
vmlal.s16 \w1, \r1, \c3 @ w1 += r1 * c3
50
vshrn.s32 \d1, \w0, #15 @ d1 = w0 >> 15, narrowed to 16-bit lanes
51
vshrn.s32 \d3, \w1, #15 @ d3 = w1 >> 15, narrowed to 16-bit lanes
52
bflies \q0, \q1, \w0, \w1
55
@ fft4: in-place transform of the vector pair d0/d1 using halving
@ add/subtract and lane shuffles.
@   d0, d1 - in/out vectors of signed 16-bit lanes
@   r0, r1 - scratch vectors, overwritten
@ The lane labels in the trailing comments are the original notes; the last
@ two give the output order as (r2, i2, r3, i1) in d1 and (r0, i0, r1, i3)
@ in d0.
@ NOTE(review): the first value written to r1 and the mask written to d1
@ are both overwritten before any visible instruction reads them, and no
@ .endm is visible; a line appears to be missing - confirm upstream.
.macro fft4 d0, d1, r0, r1
56
vhsub.s16 \r0, \d0, \d1 @ t3, t4, t8, t7
57
vhsub.s16 \r1, \d1, \d0
58
vhadd.s16 \d0, \d0, \d1 @ t1, t2, t6, t5
59
vmov.i64 \d1, #0xffff<<32
61
vrev64.16 \r1, \r0 @ t7, t8, t4, t3
62
vtrn.32 \r0, \r1 @ t3, t4, t7, t8
63
vtrn.32 \d0, \r0 @ t1, t2, t3, t4, t6, t5, t8, t7
64
vhsub.s16 \d1, \d0, \r0 @ r2, i2, r3, i1
65
vhadd.s16 \d0, \d0, \r0 @ r0, i0, r1, i3
68
@ fft8: runs fft4 on d0/d1, forms the halved sum and difference of d2/d3,
@ then finishes with transform01 over q0/q1.
@   d0..d3 - in/out vectors of signed 16-bit lanes
@   q0, q1 - vectors forwarded to transform01
@   c0, c1 - coefficient vectors forwarded to transform01
@   r0, r1 - scratch vectors (used by fft4, then r0 holds the d2+d3 sum)
@   w0, w1 - wide scratch vectors forwarded to transform01
@ NOTE(review): callers in this file pass q0/q1 alongside d0..d3, so the
@ q arguments presumably alias the d arguments - verify at the call sites.
@ No .endm is visible; lines may be missing from this view.
.macro fft8 d0, d1, d2, d3, q0, q1, c0, c1, r0, r1, w0, w1
69
fft4 \d0, \d1, \r0, \r1
70
vtrn.32 \d0, \d1 @ z0, z2, z1, z3
71
vhadd.s16 \r0, \d2, \d3 @ t1, t2, t3, t4
72
vhsub.s16 \d3, \d2, \d3 @ z5, z7
74
transform01 \q0, \q1, \d3, \c0, \c1, \r0, \w0, \w1
78
@ Loads d0-d1 from the address in r0 and stores d0-d1 back to the same
@ address; the :128 qualifier states 128-bit alignment of that address.
@ NOTE(review): the enclosing function header and whatever runs between the
@ load and the store are not visible in this view - confirm upstream.
vld1.16 {d0-d1}, [r0,:128]
80
vst1.16 {d0-d1}, [r0,:128]
85
@ Loads d0-d3 from the address in r0, loads d30 from the address in r1,
@ applies the fft8 macro in place and stores d0-d3 back to r0.
@ Scratch registers passed to fft8: d20, d21, q8, q9.
@ NOTE(review): d31 is passed as the first coefficient vector but is not
@ written by any visible line, and neither the function header nor the
@ setup of r1 is visible - confirm upstream.
vld1.16 {d0-d3}, [r0,:128]
87
vld1.16 {d30}, [r1,:64]
89
fft8 d0, d1, d2, d3, q0, q1, d31, d30, d20, d21, q8, q9
92
vst1.16 {d0-d3}, [r0,:128]
97
@ Works on d0-d7, loaded as two groups of four d registers from r0 (the
@ first load advances r0 by writeback), with coefficient vectors d28-d31
@ loaded from r1. The lane comments below label the data z0..z15.
@ Visible steps: fft8 on d0-d3, two 32-bit transposes, transform01 and
@ transform2 across the register halves, then two stores back through r0.
@ NOTE(review): no visible line rewinds r0 between the post-incremented
@ load and the first store, d26/d27 are used as coefficients without a
@ visible write, and the transform2 argument list continues on a line that
@ is missing here. The function header is also not visible - confirm
@ upstream.
vld1.16 {d0-d3}, [r0,:128]!
98
vld1.16 {d4-d7}, [r0,:128]
101
vld1.16 {d28-d31},[r1,:128]
103
fft8 d0, d1, d2, d3, q0, q1, d31, d28, d20, d21, q8, q9
107
vtrn.32 q0, q1 @ z0, z4, z2, z6, z1, z5, z3, z7
108
vtrn.32 q2, q3 @ z8, z12,z10,z14,z9, z13,z11,z15
111
transform01 q0, q2, d5, d31, d28, d20, q8, q9
114
transform2 d2, d6, d3, d7, q1, q3, d26, d30, d27, d29, \
118
vst1.16 {d0-d3}, [r0,:128]!
119
vst1.16 {d4-d7}, [r0,:128]
123
@ fft_pass_neon: combining pass over four q-register rows of data.
@ Register use that is visible in this view:
@   r1       - pointer read with post-increment (d28)
@   r2       - scaled by 4 and added to r1 to form r3
@   r3       - second pointer (d27 lane broadcast, then d29)
@   r4, r12  - data pointer and register stride for the q0..q3 loads and
@              stores; their setup is not visible here
@   lr       - first the address coefs + 24, later a post-index stride
@   d30      - four halfwords loaded from coefs + 24
@ q13 is the pair d26/d27 and q15 is the pair d30/d31, so the vmul lines
@ scale the loaded factors lane by lane before each transform.
@ NOTE(review): d31, d24/d25 (q12), r4 and r12 are used without a visible
@ write, lr is repurposed without a visible reload, both transform2
@ argument lists continue on lines missing from this view, and no loop
@ branch, register save/restore or endfunc is visible - confirm upstream.
function fft_pass_neon
125
movrel lr, coefs + 24
126
vld1.16 {d30}, [lr,:64]
129
add r3, r1, r2, lsl #2 @ r3 = r1 + r2 * 4
133
vld1.16 {d27[]}, [r3,:16] @ one halfword replicated to every lane of d27
135
vld1.16 {q0}, [r4,:128], r12
136
vld1.16 {q1}, [r4,:128], r12
137
vld1.16 {q2}, [r4,:128], r12
138
vld1.16 {q3}, [r4,:128], r12
139
vld1.16 {d28}, [r1,:64]!
140
vld1.16 {d29}, [r3,:64], lr
146
vmul.s16 d27, d27, d31
147
transform01 q0, q2, d5, d25, d27, d20, q8, q9
152
vld1.16 {q0}, [r4,:128], r12
153
vld1.16 {q1}, [r4,:128], r12
154
vld1.16 {q2}, [r4,:128], r12
155
vld1.16 {q3}, [r4,:128], r12
156
vld1.16 {d28}, [r1,:64]!
157
vld1.16 {d29}, [r3,:64], lr
165
vmul.s16 q13, q13, q15
166
transform2 d0, d4, d1, d5, q0, q2, d24, d26, d25, d27, \
175
vmul.s16 q13, q13, q15
176
transform2 d2, d6, d3, d7, q1, q3, d24, d26, d25, d27, \
185
vst1.16 {q0}, [r4,:128], r12
186
vst1.16 {q1}, [r4,:128], r12
187
vst1.16 {q2}, [r4,:128], r12
188
vst1.16 {q3}, [r4,:128], r12
195
@ Fixed-point constants, each equal to the named value times 32768 and
@ rounded (this matches the >> 15 narrowing used by the transform macros):
@   F_SQRT1_2  = 23170 ~ sqrt(1/2)
@   F_COS_16_1 = 30274 ~ cos(2*pi*1/16)
@   F_COS_16_3 = 12540 ~ cos(2*pi*3/16)
#define F_SQRT1_2 23170
196
#define F_COS_16_1 30274
197
#define F_COS_16_3 12540
200
@ Three rows of four halfwords, each in the sign pattern +c, -c, -c, +c.
@ NOTE(review): the label these rows belong to is not visible here; other
@ code in this file refers to a symbol named coefs - confirm upstream.
.short F_SQRT1_2, -F_SQRT1_2, -F_SQRT1_2, F_SQRT1_2
201
.short F_COS_16_1,-F_COS_16_1,-F_COS_16_1, F_COS_16_1
202
.short F_COS_16_3,-F_COS_16_3,-F_COS_16_3, F_COS_16_3
206
@ def_fft n, n2, n4: emits a function named fft<n>_neon. The empty-argument
@ marker after the n parameter separates it from the text that follows.
@ Within the visible body, r1 is loaded with the address of the symbol
@ ff_cos_<n>_fixed.
@ NOTE(review): n2 and n4 are not referenced by any visible line, and the
@ rest of the body plus the closing endfunc/.endm are missing from this
@ view; function, movrel and X are helpers defined outside it.
.macro def_fft n, n2, n4
207
function fft\n\()_neon
217
movrel r1, X(ff_cos_\n\()_fixed)
227
@ Instantiate def_fft for sizes 512 through 65536. In every row the second
@ argument is n/2 and the third is n/4.
@ NOTE(review): instantiations for sizes below 512 are not visible in this
@ view - confirm whether they exist upstream.
def_fft 512, 256, 128
228
def_fft 1024, 512, 256
229
def_fft 2048, 1024, 512
230
def_fft 4096, 2048, 1024
231
def_fft 8192, 4096, 2048
232
def_fft 16384, 8192, 4096
233
def_fft 32768, 16384, 8192
234
def_fft 65536, 32768, 16384
236
@ ff_fft_fixed_calc_neon: exported entry point (export=1).
@ The visible lines take the address of fft_fixed_tab_neon and load the
@ 32-bit word at index r2 from it into r3.
@ NOTE(review): how r2 is derived, what is done with the loaded word, and
@ the closing endfunc are not visible in this view - confirm upstream.
function ff_fft_fixed_calc_neon, export=1
239
movrel r3, fft_fixed_tab_neon
240
ldr r3, [r3, r2, lsl #2] @ r3 = word at r3 + r2 * 4
245
const fft_fixed_tab_neon