2
* Copyright (c) 2003, 2006 Matteo Frigo
3
* Copyright (c) 2003, 2006 Massachusetts Institute of Technology
5
* This program is free software; you can redistribute it and/or modify
6
* it under the terms of the GNU General Public License as published by
7
* the Free Software Foundation; either version 2 of the License, or
8
* (at your option) any later version.
10
* This program is distributed in the hope that it will be useful,
11
* but WITHOUT ANY WARRANTY; without even the implied warranty of
12
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
* GNU General Public License for more details.
15
* You should have received a copy of the GNU General Public License
16
* along with this program; if not, write to the Free Software
17
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21
/* This file was automatically generated --- DO NOT EDIT */
22
/* Generated on Sun Jul 2 15:50:22 EDT 2006 */
24
#include "codelet-rdft.h"
28
/* Generated by: ../../../genfft/gen_hc2hc -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 9 -dit -name hf_9 -include hf.h */
31
* This function contains 96 FP additions, 88 FP multiplications,
32
* (or, 24 additions, 16 multiplications, 72 fused multiply/add),
33
* 75 stack variables, and 36 memory accesses
37
* $Id: algsimp.ml,v 1.9 2006-02-12 23:34:12 athena Exp $
38
* $Id: fft.ml,v 1.4 2006-01-05 03:04:27 stevenj Exp $
39
* $Id: gen_hc2hc.ml,v 1.16 2006-02-12 23:34:12 athena Exp $
44
/*
 * hf_9, fused-multiply-add variant: a size-9 codelet emitted by genfft's
 * gen_hc2hc (the generator command line for this variant includes
 * "-fma -n 9 -dit -name hf_9").
 *
 * rio, iio : arrays that are read and then overwritten; in the lines visible
 *            here every access is rio[WS(ios, k)] or iio[-WS(ios, k)] with
 *            k between 1 and 8.
 * W        : read-only table, advanced by 16 entries per loop iteration; the
 *            loads from it are not among the visible lines.
 * ios      : stride passed to WS() for every index.
 * m        : sets the trip count (i starts at m - 2 and drops by 2 while > 0).
 * dist     : per-iteration step, added to rio and subtracted from iio.
 *
 * FMA / FNMS / FMS, DK, E, WS and MAKE_VOLATILE_STRIDE are defined outside
 * this view (presumably FMA-style multiply/add macros and a scalar temporary
 * type -- confirm against the included codelet header).
 *
 * NOTE(review): this copy looks damaged. The bare integer lines interleaved
 * with the code appear to be line-number residue and are not valid C, and
 * several original lines are absent: braces, the declaration of i, the
 * return statement, and the declarations of temporaries such as T15, Tr,
 * T13, T1K, T20, T1H and T1Z, which are assigned below without a visible
 * declaration. Restore from the generated original before compiling.
 */
static const R *hf_9(R *rio, R *iio, const R *W, stride ios, INT m, INT dist)
46
/* Numeric constants used by the arithmetic below. KP866025403 equals
 * sqrt(3)/2, and KP492403876 is exactly half of KP984807753. */
DK(KP777861913, +0.777861913430206160028177977318626690410586096);
47
DK(KP852868531, +0.852868531952443209628250963940074071936020296);
48
DK(KP839099631, +0.839099631177280011763127298123181364687434283);
49
DK(KP492403876, +0.492403876506104029683371512294761506835321626);
50
DK(KP984807753, +0.984807753012208059366743024589523013670643252);
51
DK(KP954188894, +0.954188894138671133499268364187245676532219158);
52
DK(KP363970234, +0.363970234266202361351047882776834043890471784);
53
DK(KP176326980, +0.176326980708464973471090386868618986121633062);
54
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
55
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
57
/* One pass per loop iteration: rio walks forward and iio walks backward by
 * dist, while W moves on by 16 entries. */
for (i = m - 2; i > 0; i = i - 2, rio = rio + dist, iio = iio - dist, W = W + 16, MAKE_VOLATILE_STRIDE(ios)) {
60
E T1, T1R, T10, T1Q, Te, T1W, Th, Tk, T1l, T1r, T1q, T1N, Ti, T1g, TT;
61
E T12, Tn, Tq, Tp, T17, Tx, T14, To, Tj;
63
E T9, Tc, TY, Ta, Tb, TX, T7, TZ, Td;
66
E T3, T6, T8, TW, T4, T2, T5;
68
T6 = iio[-WS(ios, 5)];
71
Tc = iio[-WS(ios, 2)];
79
TX = FNMS(T5, T3, TW);
82
T1R = iio[-WS(ios, 8)];
83
TZ = FNMS(Tb, T9, TY);
86
E TB, TE, T1n, TC, TH, TK, T1k, TR, TG, TJ, TD, TA;
88
TE = iio[-WS(ios, 6)];
95
E TN, TQ, TP, T1j, TO, TM;
104
TH = rio[WS(ios, 5)];
105
TK = iio[-WS(ios, 3)];
106
T1k = FNMS(TP, TN, T1j);
107
TR = FMA(TP, TQ, TO);
113
E T1o, TF, T1i, TL, T1h, TI, Tg, T1p, TS;
114
Th = rio[WS(ios, 1)];
117
T1o = FNMS(TD, TB, T1n);
118
TF = FMA(TD, TE, TC);
119
T1i = FNMS(TJ, TH, T1h);
120
TL = FMA(TJ, TK, TI);
121
Tk = iio[-WS(ios, 7)];
127
T1q = FNMS(KP500000000, T1p, T1o);
130
T1g = FNMS(KP500000000, TS, TF);
135
E Tt, Tw, Ts, Tv, T16, Tu, Tm;
136
Tt = rio[WS(ios, 7)];
137
Tw = iio[-WS(ios, 1)];
140
Tn = rio[WS(ios, 4)];
141
Tq = iio[-WS(ios, 4)];
146
T17 = FNMS(Tv, Tt, T16);
147
Tx = FMA(Tv, Tw, Tu);
155
E TV, Tf, T1d, T1a, T19, T1M, T1V, T1S, T1c, Tz, Tl, Ty;
156
TV = FNMS(KP500000000, Te, T1);
160
T15 = FNMS(Tp, Tn, T14);
161
Tr = FMA(Tp, Tq, To);
162
T13 = FNMS(Tj, Th, T12);
163
Tl = FMA(Tj, Tk, Ti);
168
T19 = FNMS(KP500000000, T18, T13);
170
T1V = FNMS(KP500000000, T1Q, T1R);
173
T1c = FNMS(KP500000000, Ty, Tl);
176
E T11, T1z, T1E, T1D, T21, T1X, T1I, T1C, T22, T1y, T24, T1u, T1U, TU;
180
E T1P, T1O, T1L, T1T;
183
T11 = FNMS(KP866025403, T10, TV);
184
T1z = FMA(KP866025403, T10, TV);
185
T1L = FNMS(KP500000000, TU, Tf);
187
T1T = FNMS(KP500000000, T1P, T1S);
189
/* First stores: slots 3 and 6 of rio and iio. Each pair of stores combines
 * the same two temporaries through KP866025403. */
rio[WS(ios, 3)] = FMA(KP866025403, T1O, T1L);
190
iio[-WS(ios, 6)] = FNMS(KP866025403, T1O, T1L);
191
iio[-WS(ios, 3)] = FMA(KP866025403, T1U, T1T);
192
rio[WS(ios, 6)] = FMS(KP866025403, T1U, T1T);
195
E T1B, T1m, T1w, T1f, T1s, T1A, T1b, T1e, T1x, T1t;
196
T1E = FNMS(KP866025403, T1a, T19);
197
T1b = FMA(KP866025403, T1a, T19);
198
T1e = FNMS(KP866025403, T1d, T1c);
199
T1D = FMA(KP866025403, T1d, T1c);
200
T1B = FMA(KP866025403, T1l, T1g);
201
T1m = FNMS(KP866025403, T1l, T1g);
202
T21 = FNMS(KP866025403, T1W, T1V);
203
T1X = FMA(KP866025403, T1W, T1V);
204
T1w = FNMS(KP176326980, T1b, T1e);
205
T1f = FMA(KP176326980, T1e, T1b);
206
T1s = FNMS(KP866025403, T1r, T1q);
207
T1A = FMA(KP866025403, T1r, T1q);
208
T1x = FMA(KP363970234, T1m, T1s);
209
T1t = FNMS(KP363970234, T1s, T1m);
210
T1I = FNMS(KP176326980, T1A, T1B);
211
T1C = FMA(KP176326980, T1B, T1A);
212
T22 = FMA(KP954188894, T1x, T1w);
213
T1y = FNMS(KP954188894, T1x, T1w);
214
T24 = FMA(KP954188894, T1t, T1f);
215
T1u = FNMS(KP954188894, T1t, T1f);
218
E T1Y, T1G, T23, T1F, T1J, T1v;
219
/* The remaining stores (slots 1, 2, 4, 5, 7, 8) are interleaved with the
 * arithmetic that produces them. */
iio[-WS(ios, 2)] = FNMS(KP984807753, T22, T21);
220
T1v = FNMS(KP492403876, T1u, T11);
221
rio[WS(ios, 2)] = FMA(KP984807753, T1u, T11);
222
T1F = FMA(KP839099631, T1E, T1D);
223
T1J = FNMS(KP839099631, T1D, T1E);
224
iio[-WS(ios, 5)] = FNMS(KP852868531, T1y, T1v);
225
iio[-WS(ios, 8)] = FMA(KP852868531, T1y, T1v);
226
T1K = FMA(KP777861913, T1J, T1I);
227
T1Y = FNMS(KP777861913, T1J, T1I);
228
T1G = FMA(KP777861913, T1F, T1C);
229
T20 = FNMS(KP777861913, T1F, T1C);
230
T23 = FMA(KP492403876, T22, T21);
231
iio[-WS(ios, 1)] = FNMS(KP984807753, T1Y, T1X);
232
rio[WS(ios, 1)] = FMA(KP984807753, T1G, T1z);
233
T1H = FNMS(KP492403876, T1G, T1z);
234
rio[WS(ios, 8)] = -(FMA(KP852868531, T24, T23));
235
rio[WS(ios, 5)] = FMS(KP852868531, T24, T23);
236
T1Z = FMA(KP492403876, T1Y, T1X);
241
iio[-WS(ios, 7)] = FNMS(KP852868531, T1K, T1H);
242
rio[WS(ios, 4)] = FMA(KP852868531, T1K, T1H);
243
iio[-WS(ios, 4)] = FMA(KP852868531, T20, T1Z);
244
rio[WS(ios, 7)] = FMS(KP852868531, T20, T1Z);
249
/* Table of tw_instr entries that the descriptor for this codelet points at.
 * NOTE(review): only the opening line of the initializer is present in this
 * copy; the entries and the closing brace are missing. */
static const tw_instr twinstr[] = {
254
/* Descriptor handed to the registration call for the FMA variant: 9 (the
 * generator was run with -n 9), the name "hf_9", the twinstr table, &GENUS,
 * and the counts {24, 16, 72, 0}, which match the "24 additions, 16
 * multiplications, 72 fused multiply/add" quoted in the generator comment.
 * The meaning of the fourth count and of the three trailing zeros is not
 * visible here -- see the hc2hc_desc declaration. */
static const hc2hc_desc desc = { 9, "hf_9", twinstr, &GENUS, {24, 16, 72, 0}, 0, 0, 0 };
256
/* Registration entry point: passes the planner p, the hf_9 function and its
 * descriptor to X(khc2hc_register).
 * NOTE(review): the closing brace is not present in this copy, and the bare
 * integer line inside the body looks like line-number residue. */
void X(codelet_hf_9) (planner *p) {
257
X(khc2hc_register) (p, hf_9, &desc);
261
/* Generated by: ../../../genfft/gen_hc2hc -compact -variables 4 -pipeline-latency 4 -n 9 -dit -name hf_9 -include hf.h */
264
* This function contains 96 FP additions, 72 FP multiplications,
265
* (or, 60 additions, 36 multiplications, 36 fused multiply/add),
266
* 41 stack variables, and 36 memory accesses
270
* $Id: algsimp.ml,v 1.9 2006-02-12 23:34:12 athena Exp $
271
* $Id: fft.ml,v 1.4 2006-01-05 03:04:27 stevenj Exp $
272
* $Id: gen_hc2hc.ml,v 1.16 2006-02-12 23:34:12 athena Exp $
277
/*
 * hf_9, variant generated without the -fma option: a size-9 codelet emitted
 * by genfft's gen_hc2hc ("-n 9 -dit -name hf_9"). The trailing
 * "#endif HAVE_FMA" line in this file suggests it is the alternative to
 * the FMA variant, selected by the preprocessor -- the #if / #else lines
 * themselves are not visible.
 *
 * rio, iio : arrays that are read and then overwritten; in the lines visible
 *            here every access is rio[WS(ios, k)] or iio[-WS(ios, k)] with
 *            k between 1 and 8.
 * W        : read-only table, advanced by 16 entries per loop iteration; the
 *            loads from it are not among the visible lines.
 * ios      : stride passed to WS() for every index.
 * m        : sets the trip count (i starts at m - 2 and drops by 2 while > 0).
 * dist     : per-iteration step, added to rio and subtracted from iio.
 *
 * FMA / FNMS / FMS, DK, E, WS and MAKE_VOLATILE_STRIDE are defined outside
 * this view; confirm their exact semantics in the included codelet header.
 *
 * NOTE(review): this copy looks damaged. The bare integer lines interleaved
 * with the code appear to be line-number residue and are not valid C, and
 * several original lines are absent: braces, the declaration of i, the
 * return statement, and the declarations of temporaries such as TS, TV, TX,
 * T10, T1y and T1v, which are assigned below without a visible declaration.
 * Restore from the generated original before compiling.
 */
static const R *hf_9(R *rio, R *iio, const R *W, stride ios, INT m, INT dist)
279
/* Numeric constants: KP866025403 is sqrt(3)/2; KP766044443 and KP642787609
 * are cos and sin of 40 degrees, KP939692620 and KP342020143 cos and sin of
 * 20 degrees, KP984807753 and KP173648177 cos and sin of 10 degrees. */
DK(KP939692620, +0.939692620785908384054109277324731469936208134);
280
DK(KP342020143, +0.342020143325668733044099614682259580763083368);
281
DK(KP984807753, +0.984807753012208059366743024589523013670643252);
282
DK(KP173648177, +0.173648177666930348851716626769314796000375677);
283
DK(KP642787609, +0.642787609686539326322643409907263432907559884);
284
DK(KP766044443, +0.766044443118978035202392650555416673935832457);
285
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
286
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
288
/* One pass per loop iteration: rio walks forward and iio walks backward by
 * dist, while W moves on by 16 entries. */
for (i = m - 2; i > 0; i = i - 2, rio = rio + dist, iio = iio - dist, W = W + 16, MAKE_VOLATILE_STRIDE(ios)) {
289
E T1, T1B, TQ, T1G, Tc, TN, T1A, T1H, TL, T1x, T17, T1o, T1c, T1n, Tu;
290
E T1w, TW, T1k, T11, T1l;
294
T1B = iio[-WS(ios, 8)];
297
T3 = rio[WS(ios, 3)];
298
T5 = iio[-WS(ios, 5)];
301
/* Each loaded rio/iio pair is combined with two coefficients through an
 * FMA / FNMS pair (here T2 and T4; presumably twiddle values read from W in
 * lines missing from this copy -- confirm). The same shape repeats below. */
T6 = FMA(T2, T3, T4 * T5);
302
TO = FNMS(T4, T3, T2 * T5);
306
T8 = rio[WS(ios, 6)];
307
Ta = iio[-WS(ios, 2)];
310
Tb = FMA(T7, T8, T9 * Ta);
311
TP = FNMS(T9, T8, T7 * Ta);
313
TQ = KP866025403 * (TO - TP);
314
T1G = KP866025403 * (Tb - T6);
316
TN = FNMS(KP500000000, Tc, T1);
318
T1H = FNMS(KP500000000, T1A, T1B);
321
E Tz, T19, TE, T14, TJ, T15, TK, T1a;
324
Tw = rio[WS(ios, 2)];
325
Ty = iio[-WS(ios, 6)];
328
Tz = FMA(Tv, Tw, Tx * Ty);
329
T19 = FNMS(Tx, Tw, Tv * Ty);
333
TB = rio[WS(ios, 5)];
334
TD = iio[-WS(ios, 3)];
337
TE = FMA(TA, TB, TC * TD);
338
T14 = FNMS(TC, TB, TA * TD);
342
TG = rio[WS(ios, 8)];
346
TJ = FMA(TF, TG, TH * TI);
347
T15 = FNMS(TH, TG, TF * TI);
354
E T13, T16, T18, T1b;
355
T13 = FNMS(KP500000000, TK, Tz);
356
T16 = KP866025403 * (T14 - T15);
359
T18 = KP866025403 * (TJ - TE);
360
T1b = FNMS(KP500000000, T1a, T19);
366
E Ti, TY, Tn, TT, Ts, TU, Tt, TZ;
369
Tf = rio[WS(ios, 1)];
370
Th = iio[-WS(ios, 7)];
373
Ti = FMA(Te, Tf, Tg * Th);
374
TY = FNMS(Tg, Tf, Te * Th);
378
Tk = rio[WS(ios, 4)];
379
Tm = iio[-WS(ios, 4)];
382
Tn = FMA(Tj, Tk, Tl * Tm);
383
TT = FNMS(Tl, Tk, Tj * Tm);
387
Tp = rio[WS(ios, 7)];
388
Tr = iio[-WS(ios, 1)];
391
Ts = FMA(To, Tp, Tq * Tr);
392
TU = FNMS(Tq, Tp, To * Tr);
400
TS = FNMS(KP500000000, Tt, Ti);
401
TV = KP866025403 * (TT - TU);
404
TX = KP866025403 * (Ts - Tn);
405
T10 = FNMS(KP500000000, TZ, TY);
412
T1y = KP866025403 * (T1w - T1x);
415
T1v = FNMS(KP500000000, TM, Td);
417
/* First stores: slots 3 and 6 of rio and iio, each pair being the sum and
 * difference of the same two temporaries. */
rio[WS(ios, 3)] = T1v + T1y;
418
iio[-WS(ios, 6)] = T1v - T1y;
421
E T1D, T1z, T1C, T1E;
422
T1D = KP866025403 * (TL - Tu);
425
T1E = FNMS(KP500000000, T1z, T1C);
427
iio[-WS(ios, 3)] = T1D + T1E;
428
rio[WS(ios, 6)] = T1D - T1E;
431
E TR, T1I, T1e, T1J, T1i, T1F, T1f, T1K;
435
E T12, T1d, T1g, T1h;
436
T12 = FMA(KP766044443, TW, KP642787609 * T11);
437
T1d = FMA(KP173648177, T17, KP984807753 * T1c);
439
T1J = KP866025403 * (T1d - T12);
440
T1g = FNMS(KP642787609, TW, KP766044443 * T11);
441
T1h = FNMS(KP984807753, T17, KP173648177 * T1c);
442
T1i = KP866025403 * (T1g - T1h);
445
/* Stores for slots 1, 4 and 7. */
rio[WS(ios, 1)] = TR + T1e;
446
iio[-WS(ios, 1)] = T1F + T1I;
447
T1f = FNMS(KP500000000, T1e, TR);
448
iio[-WS(ios, 7)] = T1f - T1i;
449
rio[WS(ios, 4)] = T1f + T1i;
450
T1K = FNMS(KP500000000, T1F, T1I);
451
rio[WS(ios, 7)] = T1J - T1K;
452
iio[-WS(ios, 4)] = T1J + T1K;
455
E T1j, T1M, T1q, T1O, T1u, T1L, T1r, T1N;
459
E T1m, T1p, T1s, T1t;
460
T1m = FMA(KP173648177, T1k, KP984807753 * T1l);
461
T1p = FNMS(KP939692620, T1o, KP342020143 * T1n);
463
T1O = KP866025403 * (T1p - T1m);
464
T1s = FNMS(KP984807753, T1k, KP173648177 * T1l);
465
T1t = FMA(KP342020143, T1o, KP939692620 * T1n);
466
T1u = KP866025403 * (T1s + T1t);
469
/* Stores for slots 2, 5 and 8. */
rio[WS(ios, 2)] = T1j + T1q;
470
iio[-WS(ios, 2)] = T1L + T1M;
471
T1r = FNMS(KP500000000, T1q, T1j);
472
iio[-WS(ios, 8)] = T1r - T1u;
473
iio[-WS(ios, 5)] = T1r + T1u;
474
T1N = FMS(KP500000000, T1L, T1M);
475
rio[WS(ios, 5)] = T1N - T1O;
476
rio[WS(ios, 8)] = T1O + T1N;
482
/* Table of tw_instr entries that the descriptor for this codelet points at.
 * NOTE(review): only the opening line of the initializer is present in this
 * copy; the entries and the closing brace are missing. */
static const tw_instr twinstr[] = {
487
/* Descriptor handed to the registration call for the non-FMA variant: 9 (the
 * generator was run with -n 9), the name "hf_9", the twinstr table, &GENUS,
 * and the counts {60, 36, 36, 0}, which match the "60 additions, 36
 * multiplications, 36 fused multiply/add" quoted in the generator comment.
 * The meaning of the fourth count and of the three trailing zeros is not
 * visible here -- see the hc2hc_desc declaration. */
static const hc2hc_desc desc = { 9, "hf_9", twinstr, &GENUS, {60, 36, 36, 0}, 0, 0, 0 };
489
/* Registration entry point: passes the planner p, the hf_9 function and its
 * descriptor to X(khc2hc_register).
 * NOTE(review): the closing brace is not present in this copy, and the bare
 * integer line inside the body looks like line-number residue. */
void X(codelet_hf_9) (planner *p) {
490
X(khc2hc_register) (p, hf_9, &desc);
492
#endif /* HAVE_FMA */