;******************************************************************************
;* FFT transform with SSE/3DNow optimizations
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2011 Vitor Sessak
;*
;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein.
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; These functions are not individually interchangeable with the C versions.
; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
; in blocks as convenient to the vector size.
; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
; Trigonometric constants and vector tables for the FFT kernels.
; Each table is replicated to 8 dwords so one symbol serves both the
; 16-byte SSE loads and the 32-byte AVX loads.
%define M_SQRT1_2 0.70710678118654752440
%define M_COS_PI_1_8 0.923879532511287
%define M_COS_PI_3_8 0.38268343236509

; cos/sin twiddle pairs for the radix-16 pass
ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8

ps_root2:     times 8 dd M_SQRT1_2
; {-r2, r2, r2, -r2} pattern used when rotating the odd radix-8 butterflies
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
; sign mask: xor'ing 1<<31 into a float flips its sign
ps_p1p1m1p1:  dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0

; lane-permutation selectors for vpermilps in the AVX fft8 path
perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
ps_p1p1m1p1root2:    dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
; sign mask applied with vxorps: flips lanes {0,1,3,5,6,7}
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
90
%macro T2_3DN 4 ; z0, z1, mem0, mem1
97
%macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1
100
pfadd %5, %4 ; {t6,t5}
101
pxor %3, [ps_m1p1] ; {t8,t7}
104
pfadd %1, %5 ; {r0,i0}
105
pfsub %6, %5 ; {r2,i2}
107
pfadd %2, %3 ; {r1,i1}
108
pfsub %4, %3 ; {r3,i3}
112
; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
113
; %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
115
; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
116
; %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
118
vsubps %5, %1, %2 ; v = %1 - %2
119
vaddps %3, %1, %2 ; w = %1 + %2
120
vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= vals1
121
vpermilps %2, %2, [perm1]
122
vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
123
vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
124
vsubps %4, %5, %1 ; s = r - q
125
vaddps %1, %5, %1 ; u = r + q
126
vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8}
127
vshufps %5, %4, %1, 0xbb
128
vshufps %3, %4, %1, 0xee
129
vperm2f128 %3, %3, %5, 0x13
130
vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {1,1,-1,-1,1,-1,-1,-1}
131
vshufps %2, %1, %4, 0xdd
132
vshufps %1, %1, %4, 0x88
133
vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4}
134
vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7}
136
vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
137
vsubps %2, %4, %1 ; %2 = v - w
138
vaddps %1, %4, %1 ; %1 = v + w
141
; In SSE mode do one fft4 transforms
142
; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
143
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
145
; In AVX mode do two fft4 transforms
146
; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
147
; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
149
subps %3, %1, %2 ; {t3,t4,-t8,t7}
150
addps %1, %1, %2 ; {t1,t2,t6,t5}
151
xorps %3, %3, [ps_p1p1m1p1]
152
shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
153
shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
154
subps %3, %1, %2 ; {r2,i2,r3,i3}
155
addps %1, %1, %2 ; {r0,i0,r1,i1}
156
shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
157
shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
160
; In SSE mode do one FFT8
161
; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
162
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %1={r4,r5,r6,r7} %2={i4,i5,i6,i7}
164
; In AVX mode do two FFT8
165
; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
166
; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
167
; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
168
; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
170
addps %6, %3, %4 ; {t1,t2,t3,t4}
171
subps %3, %3, %4 ; {r5,i5,r7,i7}
172
shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
173
mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
174
mulps %4, %4, [ps_root2]
175
addps %3, %3, %4 ; {t8,t7,ta,t9}
176
shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
177
shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
178
subps %3, %6, %4 ; {t6,t5,tc,tb}
179
addps %6, %6, %4 ; {t1,t2,t9,ta}
180
shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
181
shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
182
subps %3, %1, %6 ; {r4,r5,r6,r7}
183
addps %1, %1, %6 ; {r0,r1,r2,r3}
184
subps %4, %2, %5 ; {i4,i5,i6,i7}
185
addps %2, %2, %5 ; {i0,i1,i2,i3}
188
; scheduled for cpu-bound sizes
189
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
194
mulps m2, m4, m0 ; r2*wre
196
mulps m3, m5, m1 ; i2*wim
198
mulps m4, m4, m1 ; r2*wim
199
mulps m5, m5, m0 ; i2*wre
200
addps m2, m2, m3 ; r2*wre + i2*wim
201
mulps m3, m1, m7 ; i3*wim
202
subps m5, m5, m4 ; i2*wre - r2*wim
203
mulps m1, m1, m6 ; r3*wim
204
mulps m4, m0, m6 ; r3*wre
205
mulps m0, m0, m7 ; i3*wre
206
subps m4, m4, m3 ; r3*wre - i3*wim
208
addps m0, m0, m1 ; i3*wre + r3*wim
209
subps m1, m4, m2 ; t3
210
addps m4, m4, m2 ; t5
211
subps m3, m3, m4 ; r2
212
addps m4, m4, Z(0) ; r0
216
subps m3, m5, m0 ; t4
217
subps m4, m6, m3 ; r3
218
addps m3, m3, m6 ; r1
222
addps m3, m5, m0 ; t6
223
subps m2, m2, m1 ; i3
225
addps m1, m1, Z(3) ; i1
228
subps m4, m7, m3 ; i2
229
addps m3, m3, m7 ; i0
234
; scheduled to avoid store->load aliasing
235
%macro PASS_BIG 1 ; (!interleave)
239
mova m1, [wq+o1q] ; wim
240
mulps m2, m4, m0 ; r2*wre
242
mulps m3, m5, m1 ; i2*wim
244
mulps m4, m4, m1 ; r2*wim
245
mulps m5, m5, m0 ; i2*wre
246
addps m2, m2, m3 ; r2*wre + i2*wim
247
mulps m3, m1, m7 ; i3*wim
248
mulps m1, m1, m6 ; r3*wim
249
subps m5, m5, m4 ; i2*wre - r2*wim
250
mulps m4, m0, m6 ; r3*wre
251
mulps m0, m0, m7 ; i3*wre
252
subps m4, m4, m3 ; r3*wre - i3*wim
254
addps m0, m0, m1 ; i3*wre + r3*wim
255
subps m1, m4, m2 ; t3
256
addps m4, m4, m2 ; t5
257
subps m3, m3, m4 ; r2
258
addps m4, m4, Z(0) ; r0
262
subps m3, m5, m0 ; t4
263
subps m4, m6, m3 ; r3
264
addps m3, m3, m6 ; r1
268
addps m5, m5, m0 ; t6
269
subps m2, m2, m1 ; i3
271
addps m1, m1, Z(3) ; i1
274
subps m6, m7, m5 ; i2
275
addps m5, m5, m7 ; i0
279
INTERL m1, m3, m7, Z, 2
280
INTERL m2, m4, m0, Z2, 6
285
INTERL m5, m1, m3, Z, 0
286
INTERL m6, m2, m7, Z, 4
; Linear output addressing used by the small (leaf) transforms:
; Z(x)/Z2(x) select the x'th mmsize-sized chunk at r0, ZH(x) its upper half.
; NOTE(review): Z and Z2 are identical here; they diverge only for the
; strided big-pass addressing defined later — confirm against full file.
%define Z(x) [r0+mmsize*x]
%define Z2(x) [r0+mmsize*x]
%define ZH(x) [r0+mmsize*x+mmsize/2]
307
T8_AVX m0, m1, m2, m3, m4
321
T8_AVX m0, m1, m4, m5, m7
323
mova m4, [ps_cos16_1]
324
mova m5, [ps_cos16_2]
331
vblendps m2, m7, m3, 0xf0
332
vperm2f128 m3, m7, m3, 0x21
335
vperm2f128 m2, m2, m2, 0x01
340
vextractf128 Z(0), m0, 0
341
vextractf128 ZH(0), m1, 0
342
vextractf128 Z(1), m0, 1
343
vextractf128 ZH(1), m1, 1
344
vextractf128 Z(2), m5, 0
345
vextractf128 ZH(2), m3, 0
346
vextractf128 Z(3), m5, 1
347
vextractf128 ZH(3), m3, 1
362
T8_SSE m0, m1, m2, m3, m4, m6
363
; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
364
; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}
366
vperm2f128 m4, m0, m2, 0x20
367
vperm2f128 m5, m1, m3, 0x20
368
vperm2f128 m6, m0, m2, 0x31
369
vperm2f128 m7, m1, m3, 0x31
371
PASS_SMALL 0, [cos_32], [cos_32+32]
375
fft32_interleave_avx:
383
vextractf128 Z(0), m0, 0
384
vextractf128 ZH(0), m1, 0
385
vextractf128 Z(1), m0, 1
386
vextractf128 ZH(1), m1, 1
394
%define movdqa movaps
413
T8_SSE m0, m1, m2, m3, m4, m5
427
T8_SSE m0, m1, m2, m3, m4, m5
438
PASS_SMALL 0, [cos_16], [cos_16+16]
447
T2_3DN m0, m1, Z(0), Z(1)
450
T4_3DN m0, m1, m2, m3, m4, m5
461
T2_3DN m0, m1, Z(0), Z(1)
464
T4_3DN m0, m1, m2, m3, m4, m5
467
T2_3DN m4, m5, Z(4), Z(5)
468
T2_3DN m6, m7, Z2(6), Z2(7)
477
T4_3DN m1, m3, m5, m7, m0, m2
482
T4_3DN m0, m2, m4, m6, m5, m7
; Strided addressing for the big passes:
;   Z(x)  = zq + o1q*(x&6) + mmsize*(x&1)   (x&6 picks the quarter, x&1 the
;                                            mmsize-sized half within it)
;   Z2(x) = the o3q-offset quarter
; ZH/Z2H address the upper mmsize/2 bytes of the same chunk.
; assumes o1q/o3q hold the pass strides set up by DEFINE_ARGS — TODO confirm
%define Z(x) [zq + o1q*(x&6) + mmsize*(x&1)]
%define Z2(x) [zq + o3q + mmsize*(x&1)]
%define ZH(x) [zq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
%define Z2H(x) [zq + o3q + mmsize*(x&1) + mmsize/2]
519
%macro DECL_PASS 2+ ; name, payload
522
DEFINE_ARGS z, w, n, o1, o3
541
vextractf128 %4(%5), %2, 0
542
vextractf128 %4 %+ H(%5), %3, 0
543
vextractf128 %4(%5 + 1), %2, 1
544
vextractf128 %4 %+ H(%5 + 1), %3, 1
547
%define INTERL INTERL_AVX
549
DECL_PASS pass_avx, PASS_BIG 1
550
DECL_PASS pass_interleave_avx, PASS_BIG 0
563
%define INTERL INTERL_SSE
565
DECL_PASS pass_sse, PASS_BIG 1
566
DECL_PASS pass_interleave_sse, PASS_BIG 0
; 3DNow!/MMX has no float unpacks; substitute the integer dword unpacks,
; which move the same 32-bit lanes.
%define unpcklps punpckldq
%define unpckhps punpckhdq
574
DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q]
575
DECL_PASS pass_interleave_3dn, PASS_BIG 0
576
%define pass_3dn2 pass_3dn
577
%define pass_interleave_3dn2 pass_interleave_3dn
580
%define SECTION_REL - $$
585
%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
586
lea r2, [dispatch_tab%1]
587
mov r2, [r2 + (%2q-2)*gprsize]
593
%endmacro ; FFT_DISPATCH
595
%macro DECL_FFT 2-3 ; nbits, cpu, suffix
596
%xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL
598
%xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL
601
%xdefine list_of_fft list_of_fft, fft32%3%2 SECTION_REL
608
%xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2 SECTION_REL
613
add r0, n*4 - (n&(-2<<%1))
615
add r0, n*2 - (n2&(-2<<%1))
617
sub r0, n*6 + (n2&(-2<<%1))
627
dispatch_tab%3%2: pointer list_of_fft
631
; On x86_32, this function does the register saving and restoring for all of fft.
632
; The others pass args in registers and don't spill anything.
633
cglobal fft_dispatch%3%2, 2,5,8, z, nbits
634
FFT_DISPATCH %3%2, nbits
644
DECL_FFT 6, _avx, _interleave
648
DECL_FFT 5, _sse, _interleave
651
DECL_FFT 4, _3dn, _interleave
653
DECL_FFT 4, _3dn2, _interleave
662
%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
663
movaps xmm0, [%3+%2*4]
664
movaps xmm1, [%3+%1*4-0x10]
666
shufps xmm0, xmm1, 0x88
667
shufps xmm1, xmm2, 0x77
668
movlps xmm4, [%4+%2*2]
669
movlps xmm5, [%5+%2*2+0x0]
670
movhps xmm4, [%4+%1*2-0x8]
671
movhps xmm5, [%5+%1*2-0x8]
685
%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
686
mulps m6, %3, [%5+%1]
687
mulps m7, %2, [%5+%1]
688
mulps %2, %2, [%6+%1]
689
mulps %3, %3, [%6+%1]
694
%macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8
696
vmovaps ymm1, [%3+%1*2]
697
vmovaps ymm0, [%3+%1*2+0x20]
698
vmovaps ymm3, [%3+%2*2]
699
vmovaps ymm2, [%3+%2*2+0x20]
701
CMUL %1, ymm0, ymm1, %3, %4, %5
702
CMUL %2, ymm2, ymm3, %3, %4, %5
703
vshufps ymm1, ymm1, ymm1, 0x1b
704
vshufps ymm3, ymm3, ymm3, 0x1b
705
vperm2f128 ymm1, ymm1, ymm1, 0x01
706
vperm2f128 ymm3, ymm3, ymm3, 0x01
707
vunpcklps ymm6, ymm2, ymm1
708
vunpckhps ymm4, ymm2, ymm1
709
vunpcklps ymm7, ymm0, ymm3
710
vunpckhps ymm5, ymm0, ymm3
712
vextractf128 [%3+%1*2], ymm7, 0
713
vextractf128 [%3+%1*2+0x10], ymm5, 0
714
vextractf128 [%3+%1*2+0x20], ymm7, 1
715
vextractf128 [%3+%1*2+0x30], ymm5, 1
717
vextractf128 [%3+%2*2], ymm6, 0
718
vextractf128 [%3+%2*2+0x10], ymm4, 0
719
vextractf128 [%3+%2*2+0x20], ymm6, 1
720
vextractf128 [%3+%2*2+0x30], ymm4, 1
726
%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
728
movaps xmm1, [%3+%1*2]
729
movaps xmm0, [%3+%1*2+0x10]
730
CMUL %1, xmm0, xmm1, %3, %4, %5
731
movaps xmm5, [%3+%2*2]
732
movaps xmm4, [%3+%2*2+0x10]
733
CMUL %2, xmm4, xmm5, %3, %4, %5
734
shufps xmm1, xmm1, 0x1b
735
shufps xmm5, xmm5, 0x1b
742
movaps [%3+%2*2], xmm6
743
movaps [%3+%2*2+0x10], xmm4
744
movaps [%3+%1*2], xmm0
745
movaps [%3+%1*2+0x10], xmm2
752
cglobal imdct_half%1, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input
765
mov r3d, [r0+FFTContext.mdctsize]
768
mov rtcos, [r0+FFTContext.tcos]
769
mov rtsin, [r0+FFTContext.tsin]
777
mov rrevtab, [r0+FFTContext.revtab]
797
PREROTATER r4, r3, r2, rtcos, rtsin
799
movzx r5, word [rrevtab+r4-4]
800
movzx r6, word [rrevtab+r4-2]
801
movzx r13, word [rrevtab+r3]
802
movzx r14, word [rrevtab+r3+2]
803
movlps [r1+r5 *8], xmm0
804
movhps [r1+r6 *8], xmm0
805
movlps [r1+r13*8], xmm1
806
movhps [r1+r14*8], xmm1
810
movzx r5, word [r6+r4-4]
811
movzx r4, word [r6+r4-2]
812
movlps [r1+r5*8], xmm0
813
movhps [r1+r4*8], xmm0
814
movzx r5, word [r6+r3]
815
movzx r4, word [r6+r3+2]
816
movlps [r1+r5*8], xmm1
817
movhps [r1+r4*8], xmm1
825
mov r1d, [r5+FFTContext.nbits]
829
mov r0d, [r5+FFTContext.mdctsize]
841
%2 r0, r1, r6, rtcos, rtsin
849
%ifidn avx_enabled, 1
855
DECL_IMDCT _sse, POSROTATESHUF
860
DECL_IMDCT _avx, POSROTATESHUF_AVX