2
* Copyright (c) 2003, 2006 Matteo Frigo
3
* Copyright (c) 2003, 2006 Massachusetts Institute of Technology
5
* This program is free software; you can redistribute it and/or modify
6
* it under the terms of the GNU General Public License as published by
7
* the Free Software Foundation; either version 2 of the License, or
8
* (at your option) any later version.
10
* This program is distributed in the hope that it will be useful,
11
* but WITHOUT ANY WARRANTY; without even the implied warranty of
12
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
* GNU General Public License for more details.
15
* You should have received a copy of the GNU General Public License
16
* along with this program; if not, write to the Free Software
17
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21
/* This file was automatically generated --- DO NOT EDIT */
22
/* Generated on Sat Jul 1 13:59:13 EDT 2006 */
24
#include "codelet-dft.h"
28
/* Generated by: ../../../genfft/gen_twiddle -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 16 -name t1_16 -include t.h */
31
* This function contains 174 FP additions, 100 FP multiplications,
32
* (or, 104 additions, 30 multiplications, 70 fused multiply/add),
33
* 97 stack variables, and 64 memory accesses
37
* $Id: algsimp.ml,v 1.9 2006-02-12 23:34:12 athena Exp $
38
* $Id: fft.ml,v 1.4 2006-01-05 03:04:27 stevenj Exp $
39
* $Id: gen_twiddle.ml,v 1.24 2006-02-12 23:34:12 athena Exp $
44
/*
 * t1_16, fused multiply-add variant (see the "Generated by" line above:
 * gen_twiddle -fma ... -n 16 -name t1_16).
 *
 * Runs m iterations.  Each iteration reads elements of ri and ii at offsets
 * WS(ios, k) (k up to 15 appears below), combines them through FMA/FNMS
 * expressions, and stores results back into the same ri/ii arrays.
 *
 *   ri, ii - arrays that are both read and overwritten; presumably the real
 *            and imaginary parts of the data -- TODO confirm.
 *   W      - read-only table advanced by 30 elements per iteration;
 *            presumably twiddle factors (its loads are not visible in this
 *            excerpt) -- TODO confirm.
 *   ios    - stride handed to WS() to form element offsets.
 *   m      - number of loop iterations.
 *   dist   - amount added to ri and ii after each iteration.
 *
 * NOTE(review): the return statement and several declarations/statements are
 * not visible in this excerpt; FMA(a, b, c) / FNMS(a, b, c) are macros defined
 * elsewhere (presumably a*b + c and c - a*b -- confirm).
 */
static const R *t1_16(R *ri, R *ii, const R *W, stride ios, INT m, INT dist)
46
/* Constants: numerically cos(pi/8), sqrt(2) - 1 and sqrt(2)/2. */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
47
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
48
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
50
/* Count i down from m; step ri and ii by dist and W by 30 after each pass. */
for (i = m; i > 0; i = i - 1, ri = ri + dist, ii = ii + dist, W = W + 30, MAKE_VOLATILE_STRIDE(ios)) {
53
/* Temporaries of type E; the generator emits them in nested scopes. */
E T3z, T3o, T8, T1I, T2o, T35, T2r, T1s, T2w, T36, T2p, T1F, T3k, T1N, T3A;
54
E Tl, T1T, T2V, T1U, Tz, T29, T30, T2c, T11, TB, TE, T2h, T31, T2a, T1e;
55
E TC, T1X, TH, TK, TG, TD, TJ;
57
E Ta, Td, Tb, T1J, Tg, Tj, Tf, Tc, Ti;
59
E T1h, T1k, T1n, T2k, T1i, T1q, T1m, T1j, T1p;
61
E T1, T3n, T3, T6, T2, T5;
69
E T3l, T4, T1g, T3m, T7;
70
/* Input loads: ri/ii elements at stride-scaled offsets. */
T1h = ri[WS(ios, 15)];
71
T1k = ii[WS(ios, 15)];
76
T3m = FNMS(T5, T3, T3l);
91
E T1u, T1x, T1v, T2s, T1A, T1D, T1z, T1w, T1C;
93
E T2l, T1l, T2n, T1r, T2m, T1o, T1t;
97
T2l = FNMS(T1j, T1h, T2k);
98
T1l = FMA(T1j, T1k, T1i);
99
T2n = FNMS(T1p, T1n, T2m);
100
T1r = FMA(T1p, T1q, T1o);
101
T1x = ii[WS(ios, 3)];
110
T1A = ri[WS(ios, 11)];
111
T1D = ii[WS(ios, 11)];
116
E T2t, T1y, T2v, T1E, T2u, T1B, T9;
120
T2t = FNMS(T1w, T1u, T2s);
121
T1y = FMA(T1w, T1x, T1v);
122
T2v = FNMS(T1C, T1A, T2u);
123
T1E = FMA(T1C, T1D, T1B);
133
Tg = ri[WS(ios, 12)];
134
Tj = ii[WS(ios, 12)];
141
E TQ, TT, TR, T25, TW, TZ, TV, TS, TY;
143
E To, Tr, Tp, T1P, Tu, Tx, Tt, Tq, Tw;
145
E T1K, Te, T1M, Tk, T1L, Th, Tn;
149
T1K = FNMS(Tc, Ta, T1J);
150
Te = FMA(Tc, Td, Tb);
151
T1M = FNMS(Ti, Tg, T1L);
152
Tk = FMA(Ti, Tj, Th);
162
Tu = ri[WS(ios, 10)];
163
Tx = ii[WS(ios, 10)];
168
E T1Q, Ts, T1S, Ty, T1R, Tv, TP;
172
T1Q = FNMS(Tq, To, T1P);
173
Ts = FMA(Tq, Tr, Tp);
174
T1S = FNMS(Tw, Tu, T1R);
175
Ty = FMA(Tw, Tx, Tv);
192
E T13, T16, T14, T2d, T19, T1c, T18, T15, T1b;
194
E T26, TU, T28, T10, T27, TX, T12;
195
T13 = ri[WS(ios, 5)];
198
T26 = FNMS(TS, TQ, T25);
199
TU = FMA(TS, TT, TR);
200
T28 = FNMS(TY, TW, T27);
201
T10 = FMA(TY, TZ, TX);
202
T16 = ii[WS(ios, 5)];
211
T19 = ri[WS(ios, 13)];
212
T1c = ii[WS(ios, 13)];
217
E T2e, T17, T2g, T1d, T2f, T1a, TA;
218
TB = ri[WS(ios, 14)];
221
T2e = FNMS(T15, T13, T2d);
222
T17 = FMA(T15, T16, T14);
223
T2g = FNMS(T1b, T19, T2f);
224
T1d = FMA(T1b, T1c, T1a);
225
TE = ii[WS(ios, 14)];
243
E T2U, T3u, T2Z, T21, T1W, T34, T2X, T3f, T32, T3t, T1H, T3q, T3e, TO, T3g;
244
E T37, T3r, T3s, T3h, T3i;
246
E Tm, T1Y, TF, T20, TL, T3p, T1Z, TI;
251
T1Y = FNMS(TD, TB, T1X);
252
TF = FMA(TD, TE, TC);
253
T20 = FNMS(TJ, TH, T1Z);
254
TL = FMA(TJ, TK, TI);
258
E T1f, TM, T1G, T3j, T2W, TN;
283
/* Output stores: sum/difference pairs written back into ri/ii. */
ii[WS(ios, 12)] = T3s - T3r;
284
ii[WS(ios, 4)] = T3r + T3s;
286
ri[WS(ios, 8)] = TO - T1H;
290
E T3a, T2Y, T3x, T3v, T3b, T33;
292
ii[WS(ios, 8)] = T3q - T3i;
293
ri[WS(ios, 4)] = T3e + T3h;
294
ri[WS(ios, 12)] = T3e - T3h;
302
E T2E, T1O, T3B, T3H, T2x, T2q, T3C, T23, T2S, T2O, T2K, T2J, T3I, T2H, T2B;
305
E T2F, T1V, T22, T2G, T3c, T38;
315
E T3d, T3w, T3y, T39;
320
/* Outputs 2, 6, 10, 14: each FMA/FNMS pair scales one operand by KP707106781. */
ri[WS(ios, 6)] = FMA(KP707106781, T3d, T3a);
321
ri[WS(ios, 14)] = FNMS(KP707106781, T3d, T3a);
322
ii[WS(ios, 10)] = FNMS(KP707106781, T3w, T3v);
323
ii[WS(ios, 2)] = FMA(KP707106781, T3w, T3v);
324
ii[WS(ios, 14)] = FNMS(KP707106781, T3y, T3x);
325
ii[WS(ios, 6)] = FMA(KP707106781, T3y, T3x);
326
ri[WS(ios, 2)] = FMA(KP707106781, T39, T2Y);
327
ri[WS(ios, 10)] = FNMS(KP707106781, T39, T2Y);
332
E T2M, T2N, T2b, T2i;
339
T2S = FMA(KP414213562, T2M, T2N);
340
T2O = FNMS(KP414213562, T2N, T2M);
347
T2B = FNMS(KP414213562, T2b, T2i);
348
T2j = FMA(KP414213562, T2i, T2b);
352
E T2R, T2L, T3L, T3M;
354
E T2A, T24, T2C, T2y, T3J, T3K, T2D, T2z;
355
T2A = FNMS(KP707106781, T23, T1O);
356
T24 = FMA(KP707106781, T23, T1O);
357
T2R = FNMS(KP414213562, T2J, T2K);
358
T2L = FMA(KP414213562, T2K, T2J);
359
T2C = FMA(KP414213562, T2q, T2x);
360
T2y = FNMS(KP414213562, T2x, T2q);
361
T3J = FMA(KP707106781, T3I, T3H);
362
T3L = FNMS(KP707106781, T3I, T3H);
367
/* Odd-indexed outputs: each FMA/FNMS pair scales one operand by KP923879532. */
ii[WS(ios, 11)] = FNMS(KP923879532, T3K, T3J);
368
ii[WS(ios, 3)] = FMA(KP923879532, T3K, T3J);
369
ri[WS(ios, 3)] = FMA(KP923879532, T2z, T24);
370
ri[WS(ios, 11)] = FNMS(KP923879532, T2z, T24);
371
ri[WS(ios, 15)] = FMA(KP923879532, T2D, T2A);
372
ri[WS(ios, 7)] = FNMS(KP923879532, T2D, T2A);
375
E T2Q, T3D, T3E, T2T, T2I, T2P;
376
T2Q = FNMS(KP707106781, T2H, T2E);
377
T2I = FMA(KP707106781, T2H, T2E);
380
T3F = FNMS(KP707106781, T3C, T3B);
381
T3D = FMA(KP707106781, T3C, T3B);
382
ii[WS(ios, 15)] = FMA(KP923879532, T3M, T3L);
383
ii[WS(ios, 7)] = FNMS(KP923879532, T3M, T3L);
384
ri[WS(ios, 1)] = FMA(KP923879532, T2P, T2I);
385
ri[WS(ios, 9)] = FNMS(KP923879532, T2P, T2I);
388
ii[WS(ios, 9)] = FNMS(KP923879532, T3E, T3D);
389
ii[WS(ios, 1)] = FMA(KP923879532, T3E, T3D);
390
ri[WS(ios, 5)] = FMA(KP923879532, T2T, T2Q);
391
ri[WS(ios, 13)] = FNMS(KP923879532, T2T, T2Q);
398
ii[WS(ios, 13)] = FNMS(KP923879532, T3G, T3F);
399
ii[WS(ios, 5)] = FMA(KP923879532, T3G, T3F);
404
/* Table of tw_instr entries referenced by desc below; the entries themselves
 * are not visible in this excerpt. */
static const tw_instr twinstr[] = {
409
/* Descriptor for the FMA variant: 16 (presumably the transform size -- confirm
 * against ct_desc), the name "t1_16", the twinstr table, &GENUS, and the
 * operation counts {104, 30, 70, 0}, which match the additions /
 * multiplications / fused multiply-adds quoted in this variant's header
 * comment.  The three trailing fields are 0. */
static const ct_desc desc = { 16, "t1_16", twinstr, &GENUS, {104, 30, 70, 0}, 0, 0, 0 };
411
/* Registration entry point: hands the planner p, the t1_16 function and its
 * descriptor to X(kdft_dit_register). */
void X(codelet_t1_16) (planner *p) {
412
X(kdft_dit_register) (p, t1_16, &desc);
416
/* Generated by: ../../../genfft/gen_twiddle -compact -variables 4 -pipeline-latency 4 -n 16 -name t1_16 -include t.h */
419
* This function contains 174 FP additions, 84 FP multiplications,
420
* (or, 136 additions, 46 multiplications, 38 fused multiply/add),
421
* 52 stack variables, and 64 memory accesses
425
* $Id: algsimp.ml,v 1.9 2006-02-12 23:34:12 athena Exp $
426
* $Id: fft.ml,v 1.4 2006-01-05 03:04:27 stevenj Exp $
427
* $Id: gen_twiddle.ml,v 1.24 2006-02-12 23:34:12 athena Exp $
432
/*
 * t1_16, variant generated without the -fma flag (see the "Generated by" line
 * above: gen_twiddle -compact ... -n 16 -name t1_16).
 *
 * Runs m iterations.  Each iteration reads elements of ri and ii at offsets
 * WS(ios, k) (k up to 15 appears below), combines them with explicit
 * multiplies plus FMA/FNMS expressions, and stores results back into the
 * same ri/ii arrays.
 *
 *   ri, ii - arrays that are both read and overwritten; presumably the real
 *            and imaginary parts of the data -- TODO confirm.
 *   W      - read-only table advanced by 30 elements per iteration;
 *            presumably twiddle factors (its loads are not visible in this
 *            excerpt) -- TODO confirm.
 *   ios    - stride handed to WS() to form element offsets.
 *   m      - number of loop iterations.
 *   dist   - amount added to ri and ii after each iteration.
 *
 * NOTE(review): the return statement and several declarations/statements are
 * not visible in this excerpt; FMA(a, b, c) / FNMS(a, b, c) are macros defined
 * elsewhere (presumably a*b + c and c - a*b -- confirm).
 */
static const R *t1_16(R *ri, R *ii, const R *W, stride ios, INT m, INT dist)
434
/* Constants: numerically sin(pi/8), cos(pi/8) and sqrt(2)/2. */
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
435
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
436
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
438
/* Count i down from m; step ri and ii by dist and W by 30 after each pass. */
for (i = m; i > 0; i = i - 1, ri = ri + dist, ii = ii + dist, W = W + 30, MAKE_VOLATILE_STRIDE(ios)) {
439
/* Temporaries of type E shared across the nested scopes below. */
E T7, T37, T1t, T2U, Ti, T38, T1w, T2R, Tu, T2s, T1C, T2c, TF, T2t, T1H;
440
E T2d, T1f, T1q, T2B, T2C, T2D, T2E, T1Z, T2j, T24, T2k, TS, T13, T2w, T2x;
441
E T2y, T2z, T1O, T2g, T1T, T2h;
452
/* FMA/FNMS pairs of this shape recur for each input; presumably a complex
 * multiply of an input by a factor from W (operand loads are not all visible
 * here) -- TODO confirm. */
T6 = FMA(T2, T3, T4 * T5);
453
T2S = FNMS(T4, T3, T2 * T5);
468
Tc = FMA(T8, T9, Ta * Tb);
469
T1u = FNMS(Ta, T9, T8 * Tb);
473
Te = ri[WS(ios, 12)];
474
Tg = ii[WS(ios, 12)];
477
Th = FMA(Td, Te, Tf * Tg);
478
T1v = FNMS(Tf, Te, Td * Tg);
486
E To, T1y, Tt, T1z, T1A, T1B;
493
To = FMA(Tk, Tl, Tm * Tn);
494
T1y = FNMS(Tm, Tl, Tk * Tn);
498
Tq = ri[WS(ios, 10)];
499
Ts = ii[WS(ios, 10)];
502
Tt = FMA(Tp, Tq, Tr * Ts);
503
T1z = FNMS(Tr, Tq, Tp * Ts);
513
E Tz, T1E, TE, T1F, T1D, T1G;
516
Tw = ri[WS(ios, 14)];
517
Ty = ii[WS(ios, 14)];
520
Tz = FMA(Tv, Tw, Tx * Ty);
521
T1E = FNMS(Tx, Tw, Tv * Ty);
529
TE = FMA(TA, TB, TC * TD);
530
T1F = FNMS(TC, TB, TA * TD);
540
E T19, T20, T1p, T1X, T1e, T21, T1k, T1W;
542
E T16, T18, T15, T17;
543
T16 = ri[WS(ios, 15)];
544
T18 = ii[WS(ios, 15)];
547
T19 = FMA(T15, T16, T17 * T18);
548
T20 = FNMS(T17, T16, T15 * T18);
551
E T1m, T1o, T1l, T1n;
552
T1m = ri[WS(ios, 11)];
553
T1o = ii[WS(ios, 11)];
556
T1p = FMA(T1l, T1m, T1n * T1o);
557
T1X = FNMS(T1n, T1m, T1l * T1o);
560
E T1b, T1d, T1a, T1c;
561
T1b = ri[WS(ios, 7)];
562
T1d = ii[WS(ios, 7)];
565
T1e = FMA(T1a, T1b, T1c * T1d);
566
T21 = FNMS(T1c, T1b, T1a * T1d);
569
E T1h, T1j, T1g, T1i;
570
T1h = ri[WS(ios, 3)];
571
T1j = ii[WS(ios, 3)];
574
T1k = FMA(T1g, T1h, T1i * T1j);
575
T1W = FNMS(T1i, T1h, T1g * T1j);
584
E T1V, T1Y, T22, T23;
596
E TM, T1K, T12, T1R, TR, T1L, TX, T1Q;
603
TM = FMA(TI, TJ, TK * TL);
604
T1K = FNMS(TK, TJ, TI * TL);
608
TZ = ri[WS(ios, 13)];
609
T11 = ii[WS(ios, 13)];
612
T12 = FMA(TY, TZ, T10 * T11);
613
T1R = FNMS(T10, TZ, TY * T11);
621
TR = FMA(TN, TO, TP * TQ);
622
T1L = FNMS(TP, TO, TN * TQ);
630
TX = FMA(TT, TU, TV * TW);
631
T1Q = FNMS(TV, TU, TT * TW);
640
E T1M, T1N, T1P, T1S;
652
E T1J, T27, T3g, T3i, T26, T3h, T2a, T3d;
654
E T1x, T1I, T3e, T3f;
656
T1I = KP707106781 * (T1C - T1H);
659
T3e = KP707106781 * (T2d - T2c);
665
E T1U, T25, T28, T29;
666
T1U = FMA(KP923879532, T1O, KP382683432 * T1T);
667
T25 = FNMS(KP923879532, T24, KP382683432 * T1Z);
670
T28 = FNMS(KP923879532, T1T, KP382683432 * T1O);
671
T29 = FMA(KP382683432, T24, KP923879532 * T1Z);
675
/* Output stores for indices 3, 7, 11, 15: sum/difference pairs written back
 * into ri/ii. */
ri[WS(ios, 11)] = T1J - T26;
676
ii[WS(ios, 11)] = T3g - T3d;
677
ri[WS(ios, 3)] = T1J + T26;
678
ii[WS(ios, 3)] = T3d + T3g;
679
ri[WS(ios, 15)] = T27 - T2a;
680
ii[WS(ios, 15)] = T3i - T3h;
681
ri[WS(ios, 7)] = T27 + T2a;
682
ii[WS(ios, 7)] = T3h + T3i;
685
E T2v, T2H, T32, T34, T2G, T33, T2K, T2Z;
687
E T2r, T2u, T30, T31;
698
E T2A, T2F, T2I, T2J;
701
T2G = KP707106781 * (T2A + T2F);
702
T33 = KP707106781 * (T2F - T2A);
705
T2K = KP707106781 * (T2I - T2J);
706
T2Z = KP707106781 * (T2I + T2J);
708
/* Output stores for indices 2, 6, 10, 14. */
ri[WS(ios, 10)] = T2v - T2G;
709
ii[WS(ios, 10)] = T32 - T2Z;
710
ri[WS(ios, 2)] = T2v + T2G;
711
ii[WS(ios, 2)] = T2Z + T32;
712
ri[WS(ios, 14)] = T2H - T2K;
713
ii[WS(ios, 14)] = T34 - T33;
714
ri[WS(ios, 6)] = T2H + T2K;
715
ii[WS(ios, 6)] = T33 + T34;
718
E T2f, T2n, T3a, T3c, T2m, T3b, T2q, T35;
720
E T2b, T2e, T36, T39;
722
T2e = KP707106781 * (T2c + T2d);
725
T36 = KP707106781 * (T1C + T1H);
731
E T2i, T2l, T2o, T2p;
732
T2i = FMA(KP382683432, T2g, KP923879532 * T2h);
733
T2l = FNMS(KP382683432, T2k, KP923879532 * T2j);
736
T2o = FNMS(KP382683432, T2h, KP923879532 * T2g);
737
T2p = FMA(KP923879532, T2k, KP382683432 * T2j);
741
/* Output stores for indices 1, 5, 9, 13. */
ri[WS(ios, 9)] = T2f - T2m;
742
ii[WS(ios, 9)] = T3a - T35;
743
ri[WS(ios, 1)] = T2f + T2m;
744
ii[WS(ios, 1)] = T35 + T3a;
745
ri[WS(ios, 13)] = T2n - T2q;
746
ii[WS(ios, 13)] = T3c - T3b;
747
ri[WS(ios, 5)] = T2n + T2q;
748
ii[WS(ios, 5)] = T3b + T3c;
751
E TH, T2L, T2W, T2Y, T1s, T2X, T2O, T2P;
764
E T14, T1r, T2M, T2N;
774
/* Output stores for indices 4, 8, 12 (any store to index 0 is not visible in
 * this excerpt). */
ri[WS(ios, 8)] = TH - T1s;
775
ii[WS(ios, 8)] = T2W - T2P;
778
ri[WS(ios, 12)] = T2L - T2O;
779
ii[WS(ios, 12)] = T2Y - T2X;
780
ri[WS(ios, 4)] = T2L + T2O;
781
ii[WS(ios, 4)] = T2X + T2Y;
787
/* Table of tw_instr entries referenced by desc below; the entries themselves
 * are not visible in this excerpt. */
static const tw_instr twinstr[] = {
792
/* Descriptor for the non-FMA variant: 16 (presumably the transform size --
 * confirm against ct_desc), the name "t1_16", the twinstr table, &GENUS, and
 * the operation counts {136, 46, 38, 0}, which match the additions /
 * multiplications / fused multiply-adds quoted in this variant's header
 * comment.  The three trailing fields are 0. */
static const ct_desc desc = { 16, "t1_16", twinstr, &GENUS, {136, 46, 38, 0}, 0, 0, 0 };
794
/* Registration entry point: hands the planner p, the t1_16 function and its
 * descriptor to X(kdft_dit_register). */
void X(codelet_t1_16) (planner *p) {
795
X(kdft_dit_register) (p, t1_16, &desc);
797
#endif /* HAVE_FMA */