2
* Copyright (c) 2003, 2007-11 Matteo Frigo
3
* Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
5
* This program is free software; you can redistribute it and/or modify
6
* it under the terms of the GNU General Public License as published by
7
* the Free Software Foundation; either version 2 of the License, or
8
* (at your option) any later version.
10
* This program is distributed in the hope that it will be useful,
11
* but WITHOUT ANY WARRANTY; without even the implied warranty of
12
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
* GNU General Public License for more details.
15
* You should have received a copy of the GNU General Public License
16
* along with this program; if not, write to the Free Software
17
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21
/* This file was automatically generated --- DO NOT EDIT */
22
/* Generated on Wed Jul 27 06:16:14 EDT 2011 */
24
#include "codelet-dft.h"
28
/* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -n 4 -name t2sv_4 -include ts.h */
31
* This function contains 24 FP additions, 16 FP multiplications,
32
* (or, 16 additions, 8 multiplications, 8 fused multiply/add),
33
* 37 stack variables, 0 constants, and 16 memory accesses
37
/*
 * t2sv_4 (first definition in this file): SIMD twiddle codelet.
 *
 * Reads vectors from ri/ii with LD at offsets WS(rs, 0..3), reads values
 * from W with LDW, and writes results back to the same ri/ii offsets
 * with ST.
 *
 * NOTE(review): this listing is incomplete.  The bare numbers on their own
 * lines look like residue from a line-numbered listing, and the braces,
 * the declaration of m, and much of the arithmetic between the loads and
 * the stores are not visible here.  Restore from the generator output
 * before building.
 */
static void t2sv_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
41
/* W starts offset by mb * 4.  Each pass advances m by 2 * VL, ri and ii by
 * (2 * VL) * ms, and W by (2 * VL) * 4, and runs while m < me. */
for (m = mb, W = W + (mb * 4); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 4), MAKE_VOLATILE_STRIDE(rs)) {
42
V T2, T6, T3, T5, T1, Tx, T8, Tc, Tf, Ta, T4, Th, Tj, Tl;
44
/* Values loaded from W.  NOTE(review): T2 is declared above but its
 * assignment is not visible in this listing. */
T6 = LDW(&(W[TWVL * 3]));
45
T3 = LDW(&(W[TWVL * 2]));
46
T5 = LDW(&(W[TWVL * 1]));
47
/* Inputs at offset 0 and at WS(rs, 2). */
T1 = LD(&(ri[0]), ms, &(ri[0]));
48
Tx = LD(&(ii[0]), ms, &(ii[0]));
49
T8 = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
50
Tc = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
51
/* Inputs at WS(rs, 1) and WS(rs, 3). */
Tf = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
54
Th = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
55
Tj = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
56
Tl = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
58
V Tg, Tb, T7, Tp, Tk, Tr, Ti;
60
/* Tb and T7 combine the values loaded from W.  NOTE(review): Ta and T4
 * (and Tg, Tp, Tk, Tr, Tv, T9 used below) are assigned on lines that are
 * not visible here. */
Tb = VFNMS(T5, T3, Ta);
61
T7 = VFMA(T5, T6, T4);
65
Ti = VFMA(T5, Th, Tg);
67
V Tv, T9, Tq, Tm, Ts, Tw, Td;
70
Tq = VFNMS(T5, Tf, Tp);
71
Tm = VFMA(T6, Tl, Tk);
72
Ts = VFNMS(T6, Tj, Tr);
73
/* Tb is applied to the inputs loaded from WS(rs, 2) (T8, Tc). */
Tw = VFNMS(Tb, T8, Tv);
74
Td = VFMA(Tb, Tc, T9);
87
/* Eight stores: one VADD or VSUB result per ri/ii slot at WS(rs, 0..3).
 * NOTE(review): the operands TA, Tz, Ty, Tu, To, Tt, Te, Tn are neither
 * declared nor assigned in the visible lines. */
ST(&(ii[WS(rs, 3)]), VADD(TA, Tz), ms, &(ii[WS(rs, 1)]));
88
ST(&(ii[WS(rs, 1)]), VSUB(Tz, TA), ms, &(ii[WS(rs, 1)]));
89
ST(&(ii[WS(rs, 2)]), VSUB(Ty, Tu), ms, &(ii[0]));
90
ST(&(ii[0]), VADD(Tu, Ty), ms, &(ii[0]));
91
ST(&(ri[WS(rs, 1)]), VADD(To, Tt), ms, &(ri[WS(rs, 1)]));
92
ST(&(ri[WS(rs, 3)]), VSUB(To, Tt), ms, &(ri[WS(rs, 1)]));
93
ST(&(ri[0]), VADD(Te, Tn), ms, &(ri[0]));
94
ST(&(ri[WS(rs, 2)]), VSUB(Te, Tn), ms, &(ri[0]));
104
/* Twiddle instruction table referenced by desc.
 * NOTE(review): only the TW_NEXT entry is visible; the closing brace is
 * missing from this listing, and entries before TW_NEXT may be missing too. */
static const tw_instr twinstr[] = {
107
{TW_NEXT, (2 * VL), 0}
110
/* Codelet descriptor handed to the registration call: the value 4, the
 * codelet name string, the twinstr table, &GENUS, and the tuple
 * {16, 8, 8, 0}.  NOTE(review): the first three tuple values presumably
 * correspond to the 16 additions, 8 multiplications and 8 fused
 * multiply/adds listed in the generator's header comment; confirm against
 * the ct_desc definition. */
static const ct_desc desc = { 4, XSIMD_STRING("t2sv_4"), twinstr, &GENUS, {16, 8, 8, 0}, 0, 0, 0 };
112
/* Registers t2sv_4 and its descriptor with planner p through
 * X(kdft_dit_register).
 * NOTE(review): the closing brace is not visible in this listing. */
void XSIMD(codelet_t2sv_4) (planner *p) {
113
X(kdft_dit_register) (p, t2sv_4, &desc);
117
/* Generated by: ../../../genfft/gen_twiddle.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -n 4 -name t2sv_4 -include ts.h */
120
* This function contains 24 FP additions, 16 FP multiplications,
121
* (or, 16 additions, 8 multiplications, 8 fused multiply/add),
122
* 21 stack variables, 0 constants, and 16 memory accesses
126
/*
 * t2sv_4 (second definition in this file): SIMD twiddle codelet.
 *
 * The file ends with "#endif / * HAVE_FMA * /", so this definition is
 * presumably the alternative to the first one, selected by a preprocessor
 * conditional whose opening lines are not visible here.
 *
 * Reads vectors from ri/ii with LD at offsets WS(rs, 0..3), reads values
 * from W with LDW, and writes results back to the same ri/ii offsets
 * with ST.
 *
 * NOTE(review): this listing is incomplete.  The bare numbers on their own
 * lines look like residue from a line-numbered listing, and the braces,
 * the declaration of m, some variable declarations and the final
 * add/subtract temporaries are not visible here.
 */
static void t2sv_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
130
/* W starts offset by mb * 4.  Each pass advances m by 2 * VL, ri and ii by
 * (2 * VL) * ms, and W by (2 * VL) * 4, and runs while m < me. */
for (m = mb, W = W + (mb * 4); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 4), MAKE_VOLATILE_STRIDE(rs)) {
131
V T2, T4, T3, T5, T6, T8;
133
/* Values loaded from W.  NOTE(review): T2 is declared above but its
 * assignment is not visible in this listing. */
T4 = LDW(&(W[TWVL * 1]));
134
T3 = LDW(&(W[TWVL * 2]));
135
T5 = LDW(&(W[TWVL * 3]));
136
/* T6 and T8 are derived from the loaded W values (T2..T5); they are the
 * factors applied to the WS(rs, 2) inputs below. */
T6 = VFMA(T2, T3, VMUL(T4, T5));
137
T8 = VFNMS(T4, T3, VMUL(T2, T5));
139
V T1, Tp, Ta, To, Te, Tk, Th, Tl, T7, T9;
140
/* Inputs at offset 0 are loaded as-is. */
T1 = LD(&(ri[0]), ms, &(ri[0]));
141
Tp = LD(&(ii[0]), ms, &(ii[0]));
142
/* Inputs at WS(rs, 2), combined with (T6, T8) into Ta and To. */
T7 = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
143
T9 = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
144
Ta = VFMA(T6, T7, VMUL(T8, T9));
145
To = VFNMS(T8, T7, VMUL(T6, T9));
148
/* Inputs at WS(rs, 1), combined with (T2, T4) into Te and Tk.
 * NOTE(review): the declarations of Tc, Td, Tf, Tg are not visible. */
Tc = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
149
Td = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
150
Te = VFMA(T2, Tc, VMUL(T4, Td));
151
Tk = VFNMS(T4, Tc, VMUL(T2, Td));
152
/* Inputs at WS(rs, 3), combined with (T3, T5) into Th and Tl. */
Tf = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
153
Tg = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
154
Th = VFMA(T3, Tf, VMUL(T5, Tg));
155
Tl = VFNMS(T5, Tf, VMUL(T3, Tg));
161
/* Stores to offsets 0 and WS(rs, 2).  NOTE(review): Tb, Ti, Tn, Tq (and
 * Tj, Tm, Tr, Ts below) are computed on lines not visible here. */
ST(&(ri[WS(rs, 2)]), VSUB(Tb, Ti), ms, &(ri[0]));
162
ST(&(ri[0]), VADD(Tb, Ti), ms, &(ri[0]));
165
ST(&(ii[0]), VADD(Tn, Tq), ms, &(ii[0]));
166
ST(&(ii[WS(rs, 2)]), VSUB(Tq, Tn), ms, &(ii[0]));
172
/* Stores to offsets WS(rs, 1) and WS(rs, 3). */
ST(&(ri[WS(rs, 3)]), VSUB(Tj, Tm), ms, &(ri[WS(rs, 1)]));
173
ST(&(ri[WS(rs, 1)]), VADD(Tj, Tm), ms, &(ri[WS(rs, 1)]));
176
ST(&(ii[WS(rs, 1)]), VSUB(Tr, Ts), ms, &(ii[WS(rs, 1)]));
177
ST(&(ii[WS(rs, 3)]), VADD(Ts, Tr), ms, &(ii[WS(rs, 1)]));
185
/* Twiddle instruction table referenced by desc.
 * NOTE(review): only the TW_NEXT entry is visible; the closing brace is
 * missing from this listing, and entries before TW_NEXT may be missing too. */
static const tw_instr twinstr[] = {
188
{TW_NEXT, (2 * VL), 0}
191
/* Codelet descriptor handed to the registration call: the value 4, the
 * codelet name string, the twinstr table, &GENUS, and the tuple
 * {16, 8, 8, 0}.  NOTE(review): the first three tuple values presumably
 * correspond to the 16 additions, 8 multiplications and 8 fused
 * multiply/adds listed in the generator's header comment; confirm against
 * the ct_desc definition. */
static const ct_desc desc = { 4, XSIMD_STRING("t2sv_4"), twinstr, &GENUS, {16, 8, 8, 0}, 0, 0, 0 };
193
/* Registers t2sv_4 and its descriptor with planner p through
 * X(kdft_dit_register).
 * NOTE(review): the closing brace is not visible in this listing. */
void XSIMD(codelet_t2sv_4) (planner *p) {
194
X(kdft_dit_register) (p, t2sv_4, &desc);
196
#endif /* HAVE_FMA */