2
* Copyright (c) 2003, 2006 Matteo Frigo
3
* Copyright (c) 2003, 2006 Massachusetts Institute of Technology
5
* This program is free software; you can redistribute it and/or modify
6
* it under the terms of the GNU General Public License as published by
7
* the Free Software Foundation; either version 2 of the License, or
8
* (at your option) any later version.
10
* This program is distributed in the hope that it will be useful,
11
* but WITHOUT ANY WARRANTY; without even the implied warranty of
12
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
* GNU General Public License for more details.
15
* You should have received a copy of the GNU General Public License
16
* along with this program; if not, write to the Free Software
17
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21
/* This file was automatically generated --- DO NOT EDIT */
22
/* Generated on Sat Jul 1 14:59:35 EDT 2006 */
24
#include "codelet-dft.h"
28
/* Generated by: ../../../genfft/gen_twiddle_c -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 10 -name t1bv_10 -include t1b.h -sign 1 */
31
* This function contains 51 FP additions, 40 FP multiplications,
32
* (or, 33 additions, 22 multiplications, 18 fused multiply/add),
33
* 43 stack variables, and 20 memory accesses
37
* $Id: algsimp.ml,v 1.9 2006-02-12 23:34:12 athena Exp $
38
* $Id: fft.ml,v 1.4 2006-01-05 03:04:27 stevenj Exp $
39
* $Id: gen_twiddle_c.ml,v 1.14 2006-02-12 23:34:12 athena Exp $
44
/*
 * t1bv_10, fused multiply-add variant: the size-10 twiddle codelet that the
 * generator command line earlier in this file describes with
 * "-fma ... -simd ... -n 10 -name t1bv_10 ... -sign 1".
 *
 * What the visible lines do in one loop iteration:
 *   - load the ten inputs x[0] and x[WS(ios, k)] for k = 1..9;
 *   - pass inputs 1..9 through BYTW together with the twiddle located at
 *     W[TWVL * 2 * (k - 1)] (input 0 is used without a twiddle);
 *   - store ten results back into the same ten slots of x.
 * The loop counts i down from m in steps of VL, advancing x by VL * dist
 * and W by TWVL * 18 on every iteration.
 *
 * NOTE(review): this listing is incomplete. The bare integers between the
 * statements look like line numbers of the generated file and they skip
 * values; the declarations of i and x, several V temporaries (for example
 * Tg, T6, TM, Tv), the intermediate butterflies, the braces and the return
 * statement are not visible here. ri is unused in the visible lines, so x
 * presumably aliases ii -- confirm against the complete generated file.
 */
static const R *t1bv_10(R *ri, R *ii, const R *W, stride ios, INT m, INT dist)
46
/* Numeric constants: 0.5590... = sqrt(5)/4, 0.25, 0.6180... = (sqrt(5) - 1)/2,
   0.9510... = sin(2*pi/5). DVK presumably declares them as vector constants
   that LDK later loads -- TODO confirm in the SIMD header. */
DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
47
DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
48
DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
49
DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
53
for (i = m; i > 0; i = i - VL, x = x + (VL * dist), W = W + (TWVL * 18), MAKE_VOLATILE_STRIDE(ios)) {
54
V Td, TA, T4, Ta, Tk, TE, Tp, TF, TB, T9, T1, T2, Tb;
55
/* Input loads. The third LD argument is &(x[0]) for even input indices and
   &(x[WS(ios, 1)]) for odd ones; presumably an alignment reference for the
   SIMD load -- TODO confirm. */
T1 = LD(&(x[0]), dist, &(x[0]));
56
T2 = LD(&(x[WS(ios, 5)]), dist, &(x[WS(ios, 1)]));
59
Tg = LD(&(x[WS(ios, 4)]), dist, &(x[0]));
60
Tn = LD(&(x[WS(ios, 1)]), dist, &(x[WS(ios, 1)]));
61
Ti = LD(&(x[WS(ios, 9)]), dist, &(x[WS(ios, 1)]));
62
Tl = LD(&(x[WS(ios, 6)]), dist, &(x[0]));
65
T5 = LD(&(x[WS(ios, 2)]), dist, &(x[0]));
66
Tc = LD(&(x[WS(ios, 3)]), dist, &(x[WS(ios, 1)]));
68
V T3, Th, To, Tj, Tm, T7;
69
T7 = LD(&(x[WS(ios, 7)]), dist, &(x[WS(ios, 1)]));
70
/* Twiddle step: the value loaded from input k is combined with the twiddle
   at W[TWVL * 2 * (k - 1)], e.g. input 5 with offset 8, input 1 with 0. */
T3 = BYTW(&(W[TWVL * 8]), T2);
71
Th = BYTW(&(W[TWVL * 6]), Tg);
72
To = BYTW(&(W[0]), Tn);
73
Tj = BYTW(&(W[TWVL * 16]), Ti);
74
Tm = BYTW(&(W[TWVL * 10]), Tl);
75
T6 = BYTW(&(W[TWVL * 2]), T5);
76
Td = BYTW(&(W[TWVL * 4]), Tc);
77
T8 = BYTW(&(W[TWVL * 12]), T7);
80
Ta = LD(&(x[WS(ios, 8)]), dist, &(x[0]));
90
Tb = BYTW(&(W[TWVL * 14]), Ta);
92
V TL, TG, Tw, Tq, TC, Te;
106
V TP, TN, TH, TJ, Tz, Tx, Tr, Tt, TI, Ts;
107
/* The statements that compute TL, TM, Tv, Tw, TA, TH, T4, Tr, TJ and Tt are
   not visible in this listing. */
TP = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TL, TM));
108
TN = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TM, TL));
111
Tz = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tv, Tw));
112
Tx = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Tw, Tv));
115
/* Outputs 0 and 5 are stored as plain VADD sums. */
ST(&(x[0]), VADD(TA, TH), dist, &(x[0]));
116
TI = VFNMS(LDK(KP250000000), TH, TA);
117
ST(&(x[WS(ios, 5)]), VADD(T4, Tr), dist, &(x[WS(ios, 1)]));
118
Ts = VFNMS(LDK(KP250000000), Tr, T4);
121
TK = VFNMS(LDK(KP559016994), TJ, TI);
122
TO = VFMA(LDK(KP559016994), TJ, TI);
123
Tu = VFMA(LDK(KP559016994), Tt, Ts);
124
Ty = VFNMS(LDK(KP559016994), Tt, Ts);
125
/* The remaining eight outputs are stored in VFMAI / VFNMSI pairs that share
   the same two operands: slots 8 and 2, 6 and 4, 9 and 1, 7 and 3. */
ST(&(x[WS(ios, 8)]), VFMAI(TN, TK), dist, &(x[0]));
126
ST(&(x[WS(ios, 2)]), VFNMSI(TN, TK), dist, &(x[0]));
127
ST(&(x[WS(ios, 6)]), VFMAI(TP, TO), dist, &(x[0]));
128
ST(&(x[WS(ios, 4)]), VFNMSI(TP, TO), dist, &(x[0]));
129
ST(&(x[WS(ios, 9)]), VFNMSI(Tx, Tu), dist, &(x[WS(ios, 1)]));
130
ST(&(x[WS(ios, 1)]), VFMAI(Tx, Tu), dist, &(x[WS(ios, 1)]));
131
ST(&(x[WS(ios, 7)]), VFNMSI(Tz, Ty), dist, &(x[WS(ios, 1)]));
132
ST(&(x[WS(ios, 3)]), VFMAI(Tz, Ty), dist, &(x[WS(ios, 1)]));
141
/* Twiddle-instruction table for this codelet.
   NOTE(review): the table entries and the closing brace are not visible in
   this listing. */
static const tw_instr twinstr[] = {
154
/* Codelet descriptor handed to the planner at registration time. The leading
   10 matches the "-n 10" generator option and the string is the codelet
   name. {33, 22, 18, 0} repeats the counts from the banner of this variant:
   33 additions, 22 multiplications, 18 fused multiply/adds. The meaning of
   the trailing 0 fields is not visible here -- TODO confirm against the
   ct_desc declaration. */
static const ct_desc desc = { 10, "t1bv_10", twinstr, &GENUS, {33, 22, 18, 0}, 0, 0, 0 };
156
/* Registration entry point: hands the t1bv_10 kernel and its descriptor to
   the planner p through X(kdft_dit_register).
   NOTE(review): the closing brace of this function is not visible in this
   listing. */
void X(codelet_t1bv_10) (planner *p) {
157
X(kdft_dit_register) (p, t1bv_10, &desc);
161
/* Generated by: ../../../genfft/gen_twiddle_c -simd -compact -variables 4 -pipeline-latency 8 -n 10 -name t1bv_10 -include t1b.h -sign 1 */
164
* This function contains 51 FP additions, 30 FP multiplications,
165
* (or, 45 additions, 24 multiplications, 6 fused multiply/add),
166
* 32 stack variables, and 20 memory accesses
170
* $Id: algsimp.ml,v 1.9 2006-02-12 23:34:12 athena Exp $
171
* $Id: fft.ml,v 1.4 2006-01-05 03:04:27 stevenj Exp $
172
* $Id: gen_twiddle_c.ml,v 1.14 2006-02-12 23:34:12 athena Exp $
177
/*
 * t1bv_10, variant generated without the -fma option (see the second
 * "Generated by" banner earlier in this file: "-simd ... -n 10
 * -name t1bv_10 ... -sign 1"). It carries the same name as the other kernel
 * in this file, so the two are presumably selected by a HAVE_FMA
 * preprocessor conditional of which only the closing #endif is visible.
 *
 * What the visible lines do in one loop iteration:
 *   - load the ten inputs x[0] and x[WS(ios, k)] for k = 1..9;
 *   - immediately pass each of inputs 1..9 through BYTW with the twiddle at
 *     W[TWVL * 2 * (k - 1)] (input 0 is used without a twiddle);
 *   - store the five odd-indexed outputs, then the five even-indexed ones.
 * The loop counts i down from m in steps of VL, advancing x by VL * dist
 * and W by TWVL * 18 on every iteration.
 *
 * NOTE(review): this listing is incomplete. The bare integers between the
 * statements look like line numbers of the generated file and they skip
 * values; the declarations of i, x and several V temporaries (for example
 * Ts, Tt, Tc, T1), the intermediate sums, the braces and the return
 * statement are not visible here. ri is unused in the visible lines, so x
 * presumably aliases ii -- confirm against the complete generated file.
 */
static const R *t1bv_10(R *ri, R *ii, const R *W, stride ios, INT m, INT dist)
179
/* Numeric constants: 0.5877... = sin(pi/5), 0.9510... = sin(2*pi/5), 0.25,
   0.5590... = sqrt(5)/4. DVK presumably declares them as vector constants
   that LDK later loads -- TODO confirm in the SIMD header. */
DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
180
DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
181
DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
182
DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
186
for (i = m; i > 0; i = i - VL, x = x + (VL * dist), W = W + (TWVL * 18), MAKE_VOLATILE_STRIDE(ios)) {
187
V Tu, TH, Tg, Tl, Tp, TD, TE, TJ, T5, Ta, To, TA, TB, TI, Tr;
189
/* Input 0 is loaded as is; every other input is loaded and then combined
   with its twiddle on the following statement. The third LD argument is
   &(x[0]) for even indices and &(x[WS(ios, 1)]) for odd ones; presumably an
   alignment reference -- TODO confirm. */
Tr = LD(&(x[0]), dist, &(x[0]));
190
Ts = LD(&(x[WS(ios, 5)]), dist, &(x[WS(ios, 1)]));
191
Tt = BYTW(&(W[TWVL * 8]), Ts);
198
Tc = LD(&(x[WS(ios, 4)]), dist, &(x[0]));
199
Td = BYTW(&(W[TWVL * 6]), Tc);
200
Tj = LD(&(x[WS(ios, 1)]), dist, &(x[WS(ios, 1)]));
201
Tk = BYTW(&(W[0]), Tj);
202
Te = LD(&(x[WS(ios, 9)]), dist, &(x[WS(ios, 1)]));
203
Tf = BYTW(&(W[TWVL * 16]), Te);
204
Th = LD(&(x[WS(ios, 6)]), dist, &(x[0]));
205
Ti = BYTW(&(W[TWVL * 10]), Th);
218
T1 = LD(&(x[WS(ios, 2)]), dist, &(x[0]));
219
T2 = BYTW(&(W[TWVL * 2]), T1);
220
T8 = LD(&(x[WS(ios, 3)]), dist, &(x[WS(ios, 1)]));
221
T9 = BYTW(&(W[TWVL * 4]), T8);
222
T3 = LD(&(x[WS(ios, 7)]), dist, &(x[WS(ios, 1)]));
223
T4 = BYTW(&(W[TWVL * 12]), T3);
224
T6 = LD(&(x[WS(ios, 8)]), dist, &(x[0]));
225
T7 = BYTW(&(W[TWVL * 14]), T6);
235
/* Odd-indexed outputs (5, 3, 7, 1, 9). The statements that compute Tv, Tb,
   Tm, Tx and Ty are not visible in this listing. */
V Tq, Tv, Tw, Tn, Tz, Tb, Tm, Ty, Tx;
236
Tq = VMUL(LDK(KP559016994), VSUB(To, Tp));
238
Tw = VFNMS(LDK(KP250000000), Tv, Tu);
241
Tn = VBYI(VFMA(LDK(KP951056516), Tb, VMUL(LDK(KP587785252), Tm)));
242
Tz = VBYI(VFNMS(LDK(KP951056516), Tm, VMUL(LDK(KP587785252), Tb)));
243
ST(&(x[WS(ios, 5)]), VADD(Tu, Tv), dist, &(x[WS(ios, 1)]));
245
ST(&(x[WS(ios, 3)]), VSUB(Ty, Tz), dist, &(x[WS(ios, 1)]));
246
ST(&(x[WS(ios, 7)]), VADD(Tz, Ty), dist, &(x[WS(ios, 1)]));
248
ST(&(x[WS(ios, 1)]), VADD(Tn, Tx), dist, &(x[WS(ios, 1)]));
249
ST(&(x[WS(ios, 9)]), VSUB(Tx, Tn), dist, &(x[WS(ios, 1)]));
252
/* Even-indexed outputs (0, 4, 6, 2, 8), built with the same pattern as the
   odd half. The statements that compute TK, TC, TF, TN and TO are not
   visible in this listing. */
V TM, TK, TL, TG, TP, TC, TF, TO, TN;
253
TM = VMUL(LDK(KP559016994), VSUB(TI, TJ));
255
TL = VFNMS(LDK(KP250000000), TK, TH);
258
TG = VBYI(VFNMS(LDK(KP951056516), TF, VMUL(LDK(KP587785252), TC)));
259
TP = VBYI(VFMA(LDK(KP951056516), TC, VMUL(LDK(KP587785252), TF)));
260
ST(&(x[0]), VADD(TH, TK), dist, &(x[0]));
262
ST(&(x[WS(ios, 4)]), VSUB(TO, TP), dist, &(x[0]));
263
ST(&(x[WS(ios, 6)]), VADD(TP, TO), dist, &(x[0]));
265
ST(&(x[WS(ios, 2)]), VADD(TG, TN), dist, &(x[0]));
266
ST(&(x[WS(ios, 8)]), VSUB(TN, TG), dist, &(x[0]));
272
/* Twiddle-instruction table for this codelet.
   NOTE(review): the table entries and the closing brace are not visible in
   this listing. */
static const tw_instr twinstr[] = {
285
/* Codelet descriptor handed to the planner at registration time. The leading
   10 matches the "-n 10" generator option and the string is the codelet
   name. {45, 24, 6, 0} repeats the counts from the banner of this variant:
   45 additions, 24 multiplications, 6 fused multiply/adds. The meaning of
   the trailing 0 fields is not visible here -- TODO confirm against the
   ct_desc declaration. */
static const ct_desc desc = { 10, "t1bv_10", twinstr, &GENUS, {45, 24, 6, 0}, 0, 0, 0 };
287
/* Registration entry point: hands the t1bv_10 kernel and its descriptor to
   the planner p through X(kdft_dit_register).
   NOTE(review): the closing brace of this function is not visible in this
   listing. */
void X(codelet_t1bv_10) (planner *p) {
288
X(kdft_dit_register) (p, t1bv_10, &desc);
290
#endif /* HAVE_FMA */