2
* Copyright (c) 2003, 2007-11 Matteo Frigo
3
* Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
5
* This program is free software; you can redistribute it and/or modify
6
* it under the terms of the GNU General Public License as published by
7
* the Free Software Foundation; either version 2 of the License, or
8
* (at your option) any later version.
10
* This program is distributed in the hope that it will be useful,
11
* but WITHOUT ANY WARRANTY; without even the implied warranty of
12
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
* GNU General Public License for more details.
15
* You should have received a copy of the GNU General Public License
16
* along with this program; if not, write to the Free Software
17
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21
/* This file was automatically generated --- DO NOT EDIT */
22
/* Generated on Wed Jul 27 06:13:34 EDT 2011 */
24
#include "codelet-dft.h"
28
/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name n1fv_20 -include n1f.h */
31
* This function contains 104 FP additions, 50 FP multiplications,
32
* (or, 58 additions, 4 multiplications, 46 fused multiply/add),
33
* 71 stack variables, 4 constants, and 40 memory accesses
37
/*
 * n1fv_20 (variant generated with -fma): genfft-emitted SIMD kernel of
 * size 20.  Going by the kdft_desc / codelet-dft.h names this is presumably
 * a length-20 DFT codelet -- confirm against the genfft documentation.
 *
 * Visible behaviour: each pass of the loop loads the 20 inputs
 * xi[WS(is, 0)] .. xi[WS(is, 19)] with LD, combines them with
 * VADD/VSUB/VMUL/VFMA/VFNMS and the four DVK constants, and stores the 20
 * outputs xo[WS(os, 0)] .. xo[WS(os, 19)] with ST.
 *
 * Parameters as used in the visible lines:
 *   is, os         - strides handed to WS() to index xi and xo.
 *   v              - initial value of the loop counter i, reduced by VL
 *                    on every pass.
 *   ivs, ovs       - passed to LD / ST and, multiplied by VL, used to
 *                    advance xi and xo after each pass.
 *   ri, ii, ro, io - not referenced in the visible lines; xi and xo are
 *                    presumably set up from them in lines missing from
 *                    this excerpt (TODO confirm).
 *
 * NOTE(review): this excerpt is incomplete.  The braces of the function
 * body and the declarations of i, xi and xo are not visible; T1 and T2 are
 * assigned without a visible declaration; T15 and T1r are read without a
 * visible assignment; bare numbers are interleaved between the statements.
 * The file header says DO NOT EDIT, so regenerate the file with genfft
 * instead of patching it by hand.
 */
static void n1fv_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
39
/* Constants, numerically: sqrt(5)/4, (sqrt(5)-1)/2, sin(2*pi/5) and 1/4. */
DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
40
DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
41
DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
42
DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
49
/* Count i down from v in steps of VL; after each pass xi moves forward by
 * VL * ivs and xo by VL * ovs.  MAKE_VOLATILE_STRIDE is defined elsewhere. */
for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(is), MAKE_VOLATILE_STRIDE(os)) {
50
V TU, TI, TP, TX, TM, TW, TT, TF;
52
V T3, Tm, T1r, T13, Ta, TN, TH, TA, TG, Tt, Th, TO, T1u, T1C, T1n;
53
V T1a, T1m, T1h, T1x, T1D, TE, Ti;
56
/* Input loads.  The last LD argument is &(xi[0]) for even indices and
 * &(xi[WS(is, 1)]) for odd indices; what LD does with it is defined
 * elsewhere (presumably an alignment hint -- confirm). */
T1 = LD(&(xi[0]), ivs, &(xi[0]));
57
T2 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
58
Tk = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
59
Tl = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
61
V T14, T6, T1c, Tw, Tn, T1f, Tz, T17, T9, To, Tq, T1b, Td, Tr, Te;
64
V Tx, Ty, T7, T8, Tb, Tc;
66
V T4, T5, Tu, Tv, T11, T12;
67
T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
68
T5 = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
69
Tu = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
70
Tv = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
71
Tx = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
80
Ty = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
81
T7 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
82
T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
86
Tb = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
87
Tc = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
88
Tn = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
93
To = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
94
Tq = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
97
Tr = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
98
Te = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
99
Tf = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
108
/* Sum / difference stages on the intermediate values. */
V T1d, T1v, T18, Ts, T1e, Tg, T16, T1s;
109
T1d = VSUB(T1b, T1c);
110
T1v = VADD(T1b, T1c);
115
T16 = VSUB(T14, T15);
116
T1s = VADD(T14, T15);
118
V T1t, T19, T1w, T1g;
119
T1t = VADD(T17, T18);
120
T19 = VSUB(T17, T18);
123
T1w = VADD(T1e, T1f);
124
T1g = VSUB(T1e, T1f);
127
T1u = VADD(T1s, T1t);
128
T1C = VSUB(T1s, T1t);
129
T1n = VSUB(T16, T19);
130
T1a = VADD(T16, T19);
131
T1m = VSUB(T1d, T1g);
132
T1h = VADD(T1d, T1g);
133
T1x = VADD(T1v, T1w);
134
T1D = VSUB(T1v, T1w);
142
V TL, T1k, T1A, Tj, TD, T1E, T1G, TK, TC, T1j, T1z, T1i, T1y, TB;
145
T1i = VADD(T1a, T1h);
146
T1k = VSUB(T1a, T1h);
147
T1y = VADD(T1u, T1x);
148
T1A = VSUB(T1u, T1x);
150
TD = VFNMS(LDK(KP250000000), Ti, T3);
151
T1E = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1D, T1C));
152
T1G = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1C, T1D));
153
TK = VFNMS(LDK(KP250000000), TB, Tm);
155
T1j = VFNMS(LDK(KP250000000), T1i, T13);
156
/* Outputs 0 and 10 are stored on their own; every other output below is
 * stored as a pair (k, 20 - k) built from the same two operands, one with
 * VFMAI and one with VFNMSI (both defined elsewhere). */
ST(&(xo[0]), VADD(T1r, T1y), ovs, &(xo[0]));
157
T1z = VFNMS(LDK(KP250000000), T1y, T1r);
158
ST(&(xo[WS(os, 10)]), VADD(T13, T1i), ovs, &(xo[0]));
160
V T1p, T1l, T1o, T1q, T1F, T1B;
161
TU = VFNMS(LDK(KP618033988), TG, TH);
162
TI = VFMA(LDK(KP618033988), TH, TG);
163
TP = VFMA(LDK(KP618033988), TO, TN);
164
TX = VFNMS(LDK(KP618033988), TN, TO);
165
ST(&(xo[WS(os, 15)]), VFMAI(TC, Tj), ovs, &(xo[WS(os, 1)]));
166
ST(&(xo[WS(os, 5)]), VFNMSI(TC, Tj), ovs, &(xo[WS(os, 1)]));
167
T1p = VFMA(LDK(KP559016994), T1k, T1j);
168
T1l = VFNMS(LDK(KP559016994), T1k, T1j);
169
T1o = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1n, T1m));
170
T1q = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1m, T1n));
171
T1F = VFNMS(LDK(KP559016994), T1A, T1z);
172
T1B = VFMA(LDK(KP559016994), T1A, T1z);
173
/* Remaining even-indexed outputs. */
ST(&(xo[WS(os, 14)]), VFMAI(T1q, T1p), ovs, &(xo[0]));
174
ST(&(xo[WS(os, 6)]), VFNMSI(T1q, T1p), ovs, &(xo[0]));
175
ST(&(xo[WS(os, 18)]), VFNMSI(T1o, T1l), ovs, &(xo[0]));
176
ST(&(xo[WS(os, 2)]), VFMAI(T1o, T1l), ovs, &(xo[0]));
177
ST(&(xo[WS(os, 16)]), VFNMSI(T1E, T1B), ovs, &(xo[0]));
178
ST(&(xo[WS(os, 4)]), VFMAI(T1E, T1B), ovs, &(xo[0]));
179
ST(&(xo[WS(os, 12)]), VFMAI(T1G, T1F), ovs, &(xo[0]));
180
ST(&(xo[WS(os, 8)]), VFNMSI(T1G, T1F), ovs, &(xo[0]));
181
TM = VFNMS(LDK(KP559016994), TL, TK);
182
TW = VFMA(LDK(KP559016994), TL, TK);
183
TT = VFNMS(LDK(KP559016994), TE, TD);
184
TF = VFMA(LDK(KP559016994), TE, TD);
189
V T10, TY, TQ, TS, TJ, TR, TZ, TV;
190
T10 = VFMA(LDK(KP951056516), TX, TW);
191
TY = VFNMS(LDK(KP951056516), TX, TW);
192
TQ = VFMA(LDK(KP951056516), TP, TM);
193
TS = VFNMS(LDK(KP951056516), TP, TM);
194
TJ = VFMA(LDK(KP951056516), TI, TF);
195
TR = VFNMS(LDK(KP951056516), TI, TF);
196
TZ = VFMA(LDK(KP951056516), TU, TT);
197
TV = VFNMS(LDK(KP951056516), TU, TT);
198
/* Remaining odd-indexed outputs. */
ST(&(xo[WS(os, 11)]), VFMAI(TS, TR), ovs, &(xo[WS(os, 1)]));
199
ST(&(xo[WS(os, 9)]), VFNMSI(TS, TR), ovs, &(xo[WS(os, 1)]));
200
ST(&(xo[WS(os, 19)]), VFMAI(TQ, TJ), ovs, &(xo[WS(os, 1)]));
201
ST(&(xo[WS(os, 1)]), VFNMSI(TQ, TJ), ovs, &(xo[WS(os, 1)]));
202
ST(&(xo[WS(os, 3)]), VFMAI(TY, TV), ovs, &(xo[WS(os, 1)]));
203
ST(&(xo[WS(os, 17)]), VFNMSI(TY, TV), ovs, &(xo[WS(os, 1)]));
204
ST(&(xo[WS(os, 7)]), VFMAI(T10, TZ), ovs, &(xo[WS(os, 1)]));
205
ST(&(xo[WS(os, 13)]), VFNMSI(T10, TZ), ovs, &(xo[WS(os, 1)]));
212
/*
 * Descriptor handed to X(kdft_register) together with n1fv_20.  The leading
 * 20 matches the "-n 20" generator option (presumably the kernel size), the
 * string repeats the kernel name, and {58, 4, 46, 0} repeats the
 * "58 additions, 4 multiplications, 46 fused multiply/add" counts quoted in
 * the generator comment for this variant.  The kdft_desc field names, GENUS
 * and the meaning of the four trailing zeros are defined outside this
 * excerpt.
 */
static const kdft_desc desc = { 20, XSIMD_STRING("n1fv_20"), {58, 4, 46, 0}, &GENUS, 0, 0, 0, 0 };
214
/*
 * Registration hook: passes the n1fv_20 kernel and its descriptor to
 * X(kdft_register) for planner p.
 * NOTE(review): the closing brace of this function is not visible in this
 * excerpt.
 */
void XSIMD(codelet_n1fv_20) (planner *p) {
215
X(kdft_register) (p, n1fv_20, &desc);
220
/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name n1fv_20 -include n1f.h */
223
* This function contains 104 FP additions, 24 FP multiplications,
224
* (or, 92 additions, 12 multiplications, 12 fused multiply/add),
225
* 53 stack variables, 4 constants, and 40 memory accesses
229
/*
 * n1fv_20 (variant generated without -fma): genfft-emitted SIMD kernel of
 * size 20.  Going by the kdft_desc / codelet-dft.h names this is presumably
 * a length-20 DFT codelet -- confirm against the genfft documentation.
 * The "#endif" tagged HAVE_FMA at the end of the file suggests this is the
 * alternative used when HAVE_FMA is not defined; the matching #if / #else
 * lines are not visible in this excerpt.
 *
 * Visible behaviour: each pass of the loop loads the 20 inputs
 * xi[WS(is, 0)] .. xi[WS(is, 19)] with LD, combines them with
 * VADD/VSUB/VMUL/VFMA/VFNMS/VFMS/VBYI and the four DVK constants, and
 * stores the 20 outputs xo[WS(os, 0)] .. xo[WS(os, 19)] with ST.
 *
 * Parameters as used in the visible lines:
 *   is, os         - strides handed to WS() to index xi and xo.
 *   v              - initial value of the loop counter i, reduced by VL
 *                    on every pass.
 *   ivs, ovs       - passed to LD / ST and, multiplied by VL, used to
 *                    advance xi and xo after each pass.
 *   ri, ii, ro, io - not referenced in the visible lines; xi and xo are
 *                    presumably set up from them in lines missing from
 *                    this excerpt (TODO confirm).
 *
 * NOTE(review): this excerpt is incomplete.  The braces of the function
 * body and the declarations of i, xi and xo are not visible; T4 and T5 are
 * assigned without a visible declaration; T1g, T1h and Tj are read without
 * a visible assignment; bare numbers are interleaved between the
 * statements.  The file header says DO NOT EDIT, so regenerate the file
 * with genfft instead of patching it by hand.
 */
static void n1fv_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
231
/* Constants, numerically: sin(pi/5), sin(2*pi/5), 1/4 and sqrt(5)/4. */
DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
232
DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
233
DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
234
DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
241
/* Count i down from v in steps of VL; after each pass xi moves forward by
 * VL * ivs and xo by VL * ovs.  MAKE_VOLATILE_STRIDE is defined elsewhere. */
for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(is), MAKE_VOLATILE_STRIDE(os)) {
242
V T3, T1B, Tm, T1i, TG, TN, TO, TH, T13, T16, T1k, T1u, T1v, T1z, T1r;
243
V T1s, T1y, T1a, T1d, T1j, Ti, TD, TB, TL, Tj, TC;
245
V T1, T2, T1g, Tk, Tl, T1h;
246
/* Input loads.  The last LD argument is &(xi[0]) for even indices and
 * &(xi[WS(is, 1)]) for odd indices; what LD does with it is defined
 * elsewhere (presumably an alignment hint -- confirm). */
T1 = LD(&(xi[0]), ivs, &(xi[0]));
247
T2 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
249
Tk = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
250
Tl = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
253
T1B = VADD(T1g, T1h);
255
T1i = VSUB(T1g, T1h);
258
V T6, T18, Tw, T12, Tz, T15, T9, T1b, Td, T11, Tp, T19, Ts, T1c, Tg;
262
T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
263
T5 = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
266
Tu = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
267
Tv = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
273
Tx = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
274
Ty = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
277
T7 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
278
T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
284
Tb = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
285
Tc = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
288
Tn = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
289
To = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
295
Tq = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
296
Tr = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
299
Te = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
300
Tf = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
308
/* Sum / difference stages on the intermediate values. */
T13 = VSUB(T11, T12);
309
T16 = VSUB(T14, T15);
310
T1k = VADD(T13, T16);
311
T1u = VADD(T11, T12);
312
T1v = VADD(T14, T15);
313
T1z = VADD(T1u, T1v);
314
T1r = VADD(T18, T19);
315
T1s = VADD(T1b, T1c);
316
T1y = VADD(T1r, T1s);
317
T1a = VSUB(T18, T19);
318
T1d = VSUB(T1b, T1c);
319
T1j = VADD(T1a, T1d);
325
TD = VMUL(LDK(KP559016994), VSUB(Ta, Th));
329
TL = VMUL(LDK(KP559016994), VSUB(TA, Tt));
333
/* Apart from outputs 0 and 10, outputs are stored as pairs (k, 20 - k):
 * one is the sum and the other the difference of the same two operands,
 * one of which was produced by VBYI.  VBYI is defined elsewhere
 * (presumably multiplication by the imaginary unit -- confirm). */
TC = VBYI(VADD(Tm, TB));
334
ST(&(xo[WS(os, 5)]), VSUB(Tj, TC), ovs, &(xo[WS(os, 1)]));
335
ST(&(xo[WS(os, 15)]), VADD(Tj, TC), ovs, &(xo[WS(os, 1)]));
337
V T1A, T1C, T1D, T1x, T1G, T1t, T1w, T1F, T1E;
338
T1A = VMUL(LDK(KP559016994), VSUB(T1y, T1z));
339
T1C = VADD(T1y, T1z);
340
T1D = VFNMS(LDK(KP250000000), T1C, T1B);
341
T1t = VSUB(T1r, T1s);
342
T1w = VSUB(T1u, T1v);
343
T1x = VBYI(VFMA(LDK(KP951056516), T1t, VMUL(LDK(KP587785252), T1w)));
344
T1G = VBYI(VFNMS(LDK(KP587785252), T1t, VMUL(LDK(KP951056516), T1w)));
345
/* Outputs 0, 8, 12, 4 and 16. */
ST(&(xo[0]), VADD(T1B, T1C), ovs, &(xo[0]));
346
T1F = VSUB(T1D, T1A);
347
ST(&(xo[WS(os, 8)]), VSUB(T1F, T1G), ovs, &(xo[0]));
348
ST(&(xo[WS(os, 12)]), VADD(T1G, T1F), ovs, &(xo[0]));
349
T1E = VADD(T1A, T1D);
350
ST(&(xo[WS(os, 4)]), VADD(T1x, T1E), ovs, &(xo[0]));
351
ST(&(xo[WS(os, 16)]), VSUB(T1E, T1x), ovs, &(xo[0]));
354
V T1n, T1l, T1m, T1f, T1q, T17, T1e, T1p, T1o;
355
T1n = VMUL(LDK(KP559016994), VSUB(T1j, T1k));
356
T1l = VADD(T1j, T1k);
357
T1m = VFNMS(LDK(KP250000000), T1l, T1i);
358
T17 = VSUB(T13, T16);
359
T1e = VSUB(T1a, T1d);
360
T1f = VBYI(VFNMS(LDK(KP587785252), T1e, VMUL(LDK(KP951056516), T17)));
361
T1q = VBYI(VFMA(LDK(KP951056516), T1e, VMUL(LDK(KP587785252), T17)));
362
/* Outputs 10, 6, 14, 2 and 18. */
ST(&(xo[WS(os, 10)]), VADD(T1i, T1l), ovs, &(xo[0]));
363
T1p = VADD(T1n, T1m);
364
ST(&(xo[WS(os, 6)]), VSUB(T1p, T1q), ovs, &(xo[0]));
365
ST(&(xo[WS(os, 14)]), VADD(T1q, T1p), ovs, &(xo[0]));
366
T1o = VSUB(T1m, T1n);
367
ST(&(xo[WS(os, 2)]), VADD(T1f, T1o), ovs, &(xo[0]));
368
ST(&(xo[WS(os, 18)]), VSUB(T1o, T1f), ovs, &(xo[0]));
371
V TI, TP, TX, TU, TM, TW, TF, TT, TK, TE;
372
TI = VFMA(LDK(KP951056516), TG, VMUL(LDK(KP587785252), TH));
373
TP = VFMA(LDK(KP951056516), TN, VMUL(LDK(KP587785252), TO));
374
TX = VFNMS(LDK(KP587785252), TN, VMUL(LDK(KP951056516), TO));
375
TU = VFNMS(LDK(KP587785252), TG, VMUL(LDK(KP951056516), TH));
376
TK = VFMS(LDK(KP250000000), TB, Tm);
379
TE = VFNMS(LDK(KP250000000), Ti, T3);
385
/* Remaining odd-indexed outputs. */
TQ = VBYI(VSUB(TM, TP));
386
ST(&(xo[WS(os, 19)]), VSUB(TJ, TQ), ovs, &(xo[WS(os, 1)]));
387
ST(&(xo[WS(os, 1)]), VADD(TJ, TQ), ovs, &(xo[WS(os, 1)]));
389
T10 = VBYI(VADD(TX, TW));
390
ST(&(xo[WS(os, 13)]), VSUB(TZ, T10), ovs, &(xo[WS(os, 1)]));
391
ST(&(xo[WS(os, 7)]), VADD(TZ, T10), ovs, &(xo[WS(os, 1)]));
396
TS = VBYI(VADD(TP, TM));
397
ST(&(xo[WS(os, 11)]), VSUB(TR, TS), ovs, &(xo[WS(os, 1)]));
398
ST(&(xo[WS(os, 9)]), VADD(TR, TS), ovs, &(xo[WS(os, 1)]));
400
TY = VBYI(VSUB(TW, TX));
401
ST(&(xo[WS(os, 17)]), VSUB(TV, TY), ovs, &(xo[WS(os, 1)]));
402
ST(&(xo[WS(os, 3)]), VADD(TV, TY), ovs, &(xo[WS(os, 1)]));
410
/*
 * Descriptor handed to X(kdft_register) together with n1fv_20.  The leading
 * 20 matches the "-n 20" generator option (presumably the kernel size), the
 * string repeats the kernel name, and {92, 12, 12, 0} repeats the
 * "92 additions, 12 multiplications, 12 fused multiply/add" counts quoted
 * in the generator comment for this variant.  The kdft_desc field names,
 * GENUS and the meaning of the four trailing zeros are defined outside this
 * excerpt.
 */
static const kdft_desc desc = { 20, XSIMD_STRING("n1fv_20"), {92, 12, 12, 0}, &GENUS, 0, 0, 0, 0 };
412
/*
 * Registration hook: passes the n1fv_20 kernel and its descriptor to
 * X(kdft_register) for planner p.
 * NOTE(review): the closing brace of this function is not visible in this
 * excerpt.
 */
void XSIMD(codelet_n1fv_20) (planner *p) {
413
X(kdft_register) (p, n1fv_20, &desc);
416
#endif /* HAVE_FMA */