2
* Copyright (c) 2003, 2006 Matteo Frigo
3
* Copyright (c) 2003, 2006 Massachusetts Institute of Technology
5
* This program is free software; you can redistribute it and/or modify
6
* it under the terms of the GNU General Public License as published by
7
* the Free Software Foundation; either version 2 of the License, or
8
* (at your option) any later version.
10
* This program is distributed in the hope that it will be useful,
11
* but WITHOUT ANY WARRANTY; without even the implied warranty of
12
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
* GNU General Public License for more details.
15
* You should have received a copy of the GNU General Public License
16
* along with this program; if not, write to the Free Software
17
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21
/* This file was automatically generated --- DO NOT EDIT */
22
/* Generated on Sun Jul 2 16:31:31 EDT 2006 */
24
#include "codelet-rdft.h"
28
/* Generated by: ../../../genfft/gen_hc2hc -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hb_16 -include hb.h */
31
* This function contains 174 FP additions, 100 FP multiplications,
32
* (or, 104 additions, 30 multiplications, 70 fused multiply/add),
33
* 83 stack variables, and 64 memory accesses
37
* $Id: algsimp.ml,v 1.9 2006-02-12 23:34:12 athena Exp $
38
* $Id: fft.ml,v 1.4 2006-01-05 03:04:27 stevenj Exp $
39
* $Id: gen_hc2hc.ml,v 1.16 2006-02-12 23:34:12 athena Exp $
44
/*
 * hb_16, first definition: the variant whose generator command line (in the
 * comment preceding this function) includes -fma, -sign 1, -n 16 and -dif.
 *
 * Visible behaviour:
 *   - rio is read and written at offsets WS(ios, k); iio is read and written
 *     at offsets -WS(ios, k), plus iio[0].  The transform is in place: the
 *     same offsets that are loaded near the top are stored near the bottom.
 *   - The loop runs with i = m - 2, m - 4, ... while i > 0.  After each pass
 *     rio advances by dist, iio retreats by dist, and W advances by 30.
 *
 * NOTE(review): this excerpt is fragmentary.  The bare integer lines look
 * like line-number residue from extraction, the braces and the declaration
 * of i are not visible, and many temporaries (T2, T13, ...) are assigned on
 * lines that are missing here.  How W is read and what is returned cannot be
 * determined from this view.
 * NOTE(review): FMA(a, b, c) is presumably a * b + c and FNMS(a, b, c) is
 * presumably c - a * b; confirm against the macro definitions in the
 * included headers.
 */
static const R *hb_16(R *rio, R *iio, const R *W, stride ios, INT m, INT dist)
46
/* Numeric constants: 0.9238... is cos(pi/8), 0.7071... is sqrt(1/2) and
 * 0.4142... is tan(pi/8) = sqrt(2) - 1.  DK presumably declares a named
 * constant of the working precision (verify in the headers). */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
47
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
48
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
50
/* One pass per pair of i values; pointers and the W cursor are stepped in
 * the loop's increment clause. */
for (i = m - 2; i > 0; i = i - 2, rio = rio + dist, iio = iio - dist, W = W + 30, MAKE_VOLATILE_STRIDE(ios)) {
51
/* Temporaries of type E, declared in groups as emitted by the generator. */
E T3v, T3s, T3u, T3w, T3t;
53
E T26, T3j, T2z, T36, T11, T1K, T18, T1L, T1C, Tf, T37, T2d, T1m, TE, T3k;
54
E T2C, T1J, Tu, T20, T1F, T1a, TN, T3n, T3e, T1b, TW, T2k, T2h, T2F, T2s;
59
E T24, T3, T2y, T14, T2x, T6, T25, T17, Tb, T2b, Ta, T2a, Tz, Tc, TA;
66
/* Loads at even multiples of the stride (8, 12, 4, 10, 2, 14, 6). */
T2 = iio[-WS(ios, 8)];
68
T13 = rio[WS(ios, 8)];
74
T5 = iio[-WS(ios, 12)];
75
T15 = iio[-WS(ios, 4)];
76
T16 = rio[WS(ios, 12)];
85
T9 = iio[-WS(ios, 10)];
86
Tx = iio[-WS(ios, 2)];
87
Ty = rio[WS(ios, 10)];
88
Tb = iio[-WS(ios, 14)];
94
TA = iio[-WS(ios, 6)];
95
TB = rio[WS(ios, 14)];
99
E T27, T28, TC, Te, Td, T7, T29, T2c;
127
E T2f, Ti, T2j, TI, T2i, Tl, T2g, TL, Tq, T2m, Tp, T2q, TR, Tr, TS;
133
/* Loads at odd multiples of the stride (1, 9, 5, 13, 15, 7, 3, 11). */
Tg = rio[WS(ios, 1)];
138
Th = iio[-WS(ios, 9)];
139
TG = iio[-WS(ios, 1)];
140
TH = rio[WS(ios, 9)];
141
Tj = rio[WS(ios, 5)];
146
Tk = iio[-WS(ios, 13)];
147
TJ = iio[-WS(ios, 5)];
148
TK = rio[WS(ios, 13)];
152
Tn = iio[-WS(ios, 15)];
157
To = rio[WS(ios, 7)];
158
TP = iio[-WS(ios, 7)];
159
TQ = rio[WS(ios, 15)];
160
Tq = rio[WS(ios, 3)];
165
Tr = iio[-WS(ios, 11)];
166
TS = iio[-WS(ios, 3)];
167
TT = rio[WS(ios, 11)];
171
E TO, TV, T3c, T2r, T3d, T2o, T39, T3a;
173
E TF, Tm, T2p, T2n, TU, T1D, TM, Tt, Ts, T1E;
197
/* Pairs of intermediates combined with the tan(pi/8) constant. */
T3n = FMA(KP414213562, T3c, T3d);
198
T3e = FNMS(KP414213562, T3d, T3c);
205
T2F = FNMS(KP414213562, T2o, T2r);
206
T2s = FMA(KP414213562, T2r, T2o);
207
T3m = FMA(KP414213562, T39, T3a);
208
T3b = FNMS(KP414213562, T3a, T39);
213
E T2E, T2l, T1c, T19, TX, T1z, T1v, T1y, T1x, T1A;
215
E T1M, T1W, T21, T1V, T1Y, T1Z;
219
T2E = FMA(KP414213562, T2h, T2k);
220
T2l = FNMS(KP414213562, T2k, T2h);
225
/* From here on results are stored back into rio/iio, interleaved with the
 * remaining arithmetic. */
iio[-WS(ios, 15)] = T20 + T1Z;
227
E T1G, T1T, T1N, T1P, T1B, T1U, T1I, T1H, T1O;
229
E T1S, T1R, T1X, T22, T1Q;
234
rio[WS(ios, 8)] = FNMS(T1Y, T21, T1X);
235
iio[-WS(ios, 7)] = FMA(T1V, T21, T22);
245
rio[WS(ios, 4)] = FNMS(T1S, T1T, T1R);
247
iio[-WS(ios, 11)] = FMA(T1P, T1T, T1U);
249
rio[WS(ios, 12)] = FNMS(T1I, T1N, T1H);
251
E T1r, T1s, T1w, T1o, T1n;
256
iio[-WS(ios, 3)] = FMA(T1B, T1N, T1O);
259
/* Sum/difference pairs scaled by sqrt(1/2). */
T1w = FNMS(KP707106781, T1n, T1m);
260
T1o = FMA(KP707106781, T1n, T1m);
262
E T1l, T1t, T1q, T1p, T1u;
264
T1t = FMA(KP707106781, T1s, T1r);
265
T1z = FNMS(KP707106781, T1s, T1r);
271
rio[WS(ios, 2)] = FNMS(T1q, T1t, T1p);
274
iio[-WS(ios, 13)] = FMA(T1l, T1t, T1u);
280
E T2V, T2R, T2Q, T2W, T2N, T2M, T2L;
282
E T1g, T1f, T1j, T1h, T1i, TY;
283
rio[WS(ios, 10)] = FNMS(T1y, T1z, T1x);
284
iio[-WS(ios, 5)] = FMA(T1v, T1z, T1A);
285
T1g = FNMS(KP707106781, TX, TE);
286
TY = FMA(KP707106781, TX, TE);
288
E Tv, T10, T1d, TZ, T1e;
292
T1j = FNMS(KP707106781, T1c, T19);
293
T1d = FMA(KP707106781, T1c, T19);
298
rio[WS(ios, 14)] = FNMS(T10, T1d, TZ);
299
iio[-WS(ios, 1)] = FMA(Tv, T1d, T1e);
302
E T2u, T2K, T2H, T23, T2w;
304
E T2e, T1k, T2t, T2D, T2G;
305
T2e = FMA(KP707106781, T2d, T26);
306
T2V = FNMS(KP707106781, T2d, T26);
307
rio[WS(ios, 6)] = FNMS(T1i, T1j, T1h);
311
T2D = FMA(KP707106781, T2C, T2z);
312
T2Q = FNMS(KP707106781, T2C, T2z);
315
iio[-WS(ios, 9)] = FMA(T1f, T1j, T1k);
316
/* Sum/difference pairs scaled by cos(pi/8). */
T2u = FMA(KP923879532, T2t, T2e);
317
T2N = FNMS(KP923879532, T2t, T2e);
318
T2K = FNMS(KP923879532, T2G, T2D);
319
T2H = FMA(KP923879532, T2G, T2D);
324
E T2J, T2I, T2v, T2O;
331
iio[-WS(ios, 14)] = FMA(T2w, T2u, T2I);
332
rio[WS(ios, 1)] = FNMS(T2w, T2H, T2v);
333
rio[WS(ios, 9)] = FNMS(T2M, T2K, T2O);
337
iio[-WS(ios, 6)] = FMA(T2M, T2N, T2L);
339
E T33, T30, T32, T34, T31;
341
E T2P, T2S, T2X, T2U, T2T, T2Z, T2Y;
343
T33 = FNMS(KP923879532, T2R, T2Q);
344
T2S = FMA(KP923879532, T2R, T2Q);
345
T30 = FNMS(KP923879532, T2W, T2V);
346
T2X = FMA(KP923879532, T2W, T2V);
352
iio[-WS(ios, 2)] = FMA(T2U, T2X, T2T);
355
rio[WS(ios, 13)] = FNMS(T2U, T2S, T2Y);
358
E T3l, T3f, T38, T3o, T3L, T3I, T3K, T3M, T3J;
360
E T3y, T3z, T3D, T3E;
361
T3l = FMA(KP707106781, T3k, T3j);
362
T3y = FNMS(KP707106781, T3k, T3j);
363
iio[-WS(ios, 10)] = FMA(T32, T30, T34);
364
rio[WS(ios, 5)] = FNMS(T32, T33, T31);
367
T38 = FMA(KP707106781, T37, T36);
368
T3D = FNMS(KP707106781, T37, T36);
372
E T3x, T3A, T3F, T3C, T3B, T3H, T3G;
374
T3L = FMA(KP923879532, T3z, T3y);
375
T3A = FNMS(KP923879532, T3z, T3y);
376
T3I = FNMS(KP923879532, T3E, T3D);
377
T3F = FMA(KP923879532, T3E, T3D);
383
rio[WS(ios, 3)] = FNMS(T3C, T3F, T3B);
386
iio[-WS(ios, 12)] = FMA(T3C, T3A, T3G);
389
rio[WS(ios, 11)] = FNMS(T3K, T3I, T3M);
390
iio[-WS(ios, 4)] = FMA(T3K, T3L, T3J);
392
E T35, T3g, T3p, T3i, T3h, T3r, T3q;
394
T3v = FNMS(KP923879532, T3f, T38);
395
T3g = FMA(KP923879532, T3f, T38);
396
T3s = FNMS(KP923879532, T3o, T3l);
397
T3p = FMA(KP923879532, T3o, T3l);
403
/* Last visible stores of the pass: iio[0], rio at 15 and 7, iio at -8. */
iio[0] = FMA(T3i, T3p, T3h);
406
rio[WS(ios, 15)] = FNMS(T3i, T3g, T3q);
413
iio[-WS(ios, 8)] = FMA(T3u, T3s, T3w);
414
rio[WS(ios, 7)] = FNMS(T3u, T3v, T3t);
419
/* Table of tw_instr entries that desc refers to.  The initializer entries
 * and the closing brace are not visible in this excerpt. */
static const tw_instr twinstr[] = {
424
/* Descriptor passed to the planner for this codelet: 16, the name string
 * "hb_16", the twinstr table, &GENUS, and the counts {104, 30, 70, 0}, which
 * match the 104 additions / 30 multiplications / 70 fused multiply-adds
 * reported in the generator summary comment for this variant.
 * NOTE(review): the meaning of the three trailing zeros is defined by
 * hc2hc_desc, which is not visible here. */
static const hc2hc_desc desc = { 16, "hb_16", twinstr, &GENUS, {104, 30, 70, 0}, 0, 0, 0 };
426
/* Registration entry point: hands hb_16 and its descriptor to the planner p
 * through X(khc2hc_register).  The closing brace is not visible in this
 * excerpt. */
void X(codelet_hb_16) (planner *p) {
427
X(khc2hc_register) (p, hb_16, &desc);
431
/* Generated by: ../../../genfft/gen_hc2hc -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hb_16 -include hb.h */
434
* This function contains 174 FP additions, 84 FP multiplications,
435
* (or, 136 additions, 46 multiplications, 38 fused multiply/add),
436
* 50 stack variables, and 64 memory accesses
440
* $Id: algsimp.ml,v 1.9 2006-02-12 23:34:12 athena Exp $
441
* $Id: fft.ml,v 1.4 2006-01-05 03:04:27 stevenj Exp $
442
* $Id: gen_hc2hc.ml,v 1.16 2006-02-12 23:34:12 athena Exp $
447
/*
 * hb_16, second definition: the variant whose generator command line (in the
 * comment preceding this function) has -sign 1, -n 16 and -dif but no -fma.
 * The trailing "#endif HAVE_FMA" in this file suggests the two definitions
 * are alternatives selected by the preprocessor; the matching #if / #else
 * lines are not visible here, so confirm before relying on that.
 *
 * Visible behaviour:
 *   - rio is read and written at offsets WS(ios, k); iio is read and written
 *     at offsets -WS(ios, k), plus iio[0].  The transform is in place.
 *   - The loop runs with i = m - 2, m - 4, ... while i > 0.  After each pass
 *     rio advances by dist, iio retreats by dist, and W advances by 30.
 *
 * NOTE(review): this excerpt is fragmentary.  The bare integer lines look
 * like line-number residue from extraction, the braces and the declaration
 * of i are not visible, and many temporaries are assigned on lines that are
 * missing here.  How W is read and what is returned cannot be determined
 * from this view.
 * NOTE(review): FMA(a, b, c) is presumably a * b + c and FNMS(a, b, c) is
 * presumably c - a * b; confirm against the macro definitions in the
 * included headers.
 */
static const R *hb_16(R *rio, R *iio, const R *W, stride ios, INT m, INT dist)
449
/* Numeric constants: 0.3826... is sin(pi/8), 0.9238... is cos(pi/8) and
 * 0.7071... is sqrt(1/2).  DK presumably declares a named constant of the
 * working precision (verify in the headers). */
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
450
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
451
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
453
/* One pass per pair of i values; pointers and the W cursor are stepped in
 * the loop's increment clause. */
for (i = m - 2; i > 0; i = i - 2, rio = rio + dist, iio = iio - dist, W = W + 30, MAKE_VOLATILE_STRIDE(ios)) {
454
E T7, T2K, T30, Tw, T1a, T2e, T2k, T1B, Te, TD, T1C, T13, T2n, T2Z, T2b;
455
E T2L, Tm, T1v, TN, T10, T1W, T2p, T2P, T2W, Tt, T1w, TW, T11, T23, T2q;
458
E T3, T2c, T16, T2j, T6, T2i, T19, T2d;
462
/* Loads at stride multiples 8, 4 and 12. */
T2 = iio[-WS(ios, 8)];
466
T15 = rio[WS(ios, 8)];
472
T4 = rio[WS(ios, 4)];
473
T5 = iio[-WS(ios, 12)];
476
T17 = iio[-WS(ios, 4)];
477
T18 = rio[WS(ios, 12)];
491
E Ta, T29, Tz, T28, Td, T25, TC, T26;
494
/* Loads at stride multiples 2, 10, 14 and 6. */
T8 = rio[WS(ios, 2)];
495
T9 = iio[-WS(ios, 10)];
498
Tx = iio[-WS(ios, 2)];
499
Ty = rio[WS(ios, 10)];
505
Tb = iio[-WS(ios, 14)];
506
Tc = rio[WS(ios, 6)];
509
TA = iio[-WS(ios, 6)];
510
TB = rio[WS(ios, 14)];
519
E T2l, T2m, T27, T2a;
522
/* Sums and differences scaled by sqrt(1/2). */
T2n = KP707106781 * (T2l - T2m);
523
T2Z = KP707106781 * (T2l + T2m);
526
T2b = KP707106781 * (T27 - T2a);
527
T2L = KP707106781 * (T2a + T27);
531
E Ti, T1Q, TI, T1U, Tl, T1T, TL, T1R, TF, TM;
534
/* Loads at stride multiples 1, 9, 5 and 13. */
Tg = rio[WS(ios, 1)];
535
Th = iio[-WS(ios, 9)];
538
TG = iio[-WS(ios, 1)];
539
TH = rio[WS(ios, 9)];
545
Tj = rio[WS(ios, 5)];
546
Tk = iio[-WS(ios, 13)];
549
TJ = iio[-WS(ios, 5)];
550
TK = rio[WS(ios, 13)];
561
E T1S, T1V, T2N, T2O;
564
/* Linear combinations weighted by cos(pi/8) and sin(pi/8). */
T1W = FNMS(KP382683432, T1V, KP923879532 * T1S);
565
T2p = FMA(KP923879532, T1V, KP382683432 * T1S);
568
T2P = FNMS(KP382683432, T2O, KP923879532 * T2N);
569
T2W = FMA(KP382683432, T2N, KP923879532 * T2O);
573
E Tp, T1X, TR, T21, Ts, T20, TU, T1Y, TO, TV;
576
/* Loads at stride multiples 15, 7, 3 and 11. */
Tn = iio[-WS(ios, 15)];
577
To = rio[WS(ios, 7)];
580
TP = iio[-WS(ios, 7)];
581
TQ = rio[WS(ios, 15)];
587
Tq = rio[WS(ios, 3)];
588
Tr = iio[-WS(ios, 11)];
591
TS = iio[-WS(ios, 3)];
592
TT = rio[WS(ios, 11)];
603
E T1Z, T22, T2Q, T2R;
606
T23 = FMA(KP923879532, T1Z, KP382683432 * T22);
607
T2q = FNMS(KP382683432, T1Z, KP923879532 * T22);
610
T2S = FNMS(KP923879532, T2R, KP382683432 * T2Q);
611
T2X = FMA(KP923879532, T2Q, KP382683432 * T2R);
615
E Tf, Tu, T1K, T1M, T1N, T1O, T1J, T1L;
623
/* From here on results are stored back into rio/iio.  Each rio/iio pair
 * below has the shape (a*c - b*d, b*c + a*d) or a permutation of it;
 * presumably a complex multiplication by a twiddle factor, but the operand
 * definitions are not visible in this excerpt. */
iio[-WS(ios, 15)] = T1N + T1M;
626
rio[WS(ios, 8)] = FNMS(T1L, T1O, T1J * T1K);
627
iio[-WS(ios, 7)] = FMA(T1L, T1K, T1J * T1O);
630
E T2U, T36, T32, T34;
632
E T2M, T2T, T2Y, T31;
643
E T2J, T2V, T33, T35;
646
iio[0] = FMA(T2J, T2U, T2V * T32);
647
rio[WS(ios, 15)] = FNMS(T2V, T2U, T2J * T32);
650
rio[WS(ios, 7)] = FNMS(T35, T36, T33 * T34);
651
iio[-WS(ios, 8)] = FMA(T33, T36, T35 * T34);
659
TX = KP707106781 * (TN + TW);
662
T12 = KP707106781 * (T10 + T11);
671
rio[WS(ios, 14)] = FNMS(TZ, T1c, Tv * TY);
672
iio[-WS(ios, 1)] = FMA(TZ, TY, Tv * T1c);
675
rio[WS(ios, 6)] = FNMS(T1f, T1g, T1d * T1e);
676
iio[-WS(ios, 9)] = FMA(T1f, T1e, T1d * T1g);
680
E T2g, T2w, T2s, T2u;
682
E T24, T2f, T2o, T2r;
693
E T1P, T2h, T2t, T2v;
696
rio[WS(ios, 1)] = FNMS(T2h, T2s, T1P * T2g);
697
iio[-WS(ios, 14)] = FMA(T1P, T2s, T2h * T2g);
700
iio[-WS(ios, 6)] = FMA(T2t, T2u, T2v * T2w);
701
rio[WS(ios, 9)] = FNMS(T2v, T2u, T2t * T2w);
705
E T1k, T1q, T1o, T1s;
707
E T1i, T1j, T1m, T1n;
709
T1j = KP707106781 * (T11 - T10);
712
T1m = KP707106781 * (TN - TW);
718
E T1h, T1l, T1p, T1r;
721
rio[WS(ios, 2)] = FNMS(T1l, T1o, T1h * T1k);
722
iio[-WS(ios, 13)] = FMA(T1l, T1k, T1h * T1o);
725
rio[WS(ios, 10)] = FNMS(T1r, T1s, T1p * T1q);
726
iio[-WS(ios, 5)] = FMA(T1r, T1q, T1p * T1s);
730
E T2A, T2I, T2E, T2G;
732
E T2y, T2z, T2C, T2D;
743
E T2x, T2B, T2F, T2H;
746
iio[-WS(ios, 2)] = FMA(T2x, T2A, T2B * T2E);
747
rio[WS(ios, 13)] = FNMS(T2B, T2A, T2x * T2E);
750
rio[WS(ios, 5)] = FNMS(T2H, T2I, T2F * T2G);
751
iio[-WS(ios, 10)] = FMA(T2F, T2I, T2H * T2G);
755
E T1y, T1G, T1E, T1I;
757
E T1u, T1x, T1A, T1D;
768
E T1t, T1z, T1F, T1H;
771
rio[WS(ios, 12)] = FNMS(T1z, T1E, T1t * T1y);
772
iio[-WS(ios, 3)] = FMA(T1z, T1y, T1t * T1E);
775
rio[WS(ios, 4)] = FNMS(T1H, T1I, T1F * T1G);
776
iio[-WS(ios, 11)] = FMA(T1H, T1G, T1F * T1I);
780
E T3a, T3i, T3e, T3g;
782
E T38, T39, T3c, T3d;
793
E T37, T3b, T3f, T3h;
796
/* Last visible stores of the pass: stride multiples 3, 12, 4 and 11. */
rio[WS(ios, 3)] = FNMS(T3b, T3e, T37 * T3a);
797
iio[-WS(ios, 12)] = FMA(T37, T3e, T3b * T3a);
800
iio[-WS(ios, 4)] = FMA(T3f, T3g, T3h * T3i);
801
rio[WS(ios, 11)] = FNMS(T3h, T3g, T3f * T3i);
808
/* Table of tw_instr entries that desc refers to.  The initializer entries
 * and the closing brace are not visible in this excerpt. */
static const tw_instr twinstr[] = {
813
/* Descriptor passed to the planner for this codelet: 16, the name string
 * "hb_16", the twinstr table, &GENUS, and the counts {136, 46, 38, 0}, which
 * match the 136 additions / 46 multiplications / 38 fused multiply-adds
 * reported in the generator summary comment for this variant.
 * NOTE(review): the meaning of the three trailing zeros is defined by
 * hc2hc_desc, which is not visible here. */
static const hc2hc_desc desc = { 16, "hb_16", twinstr, &GENUS, {136, 46, 38, 0}, 0, 0, 0 };
815
/* Registration entry point: hands hb_16 and its descriptor to the planner p
 * through X(khc2hc_register).  The closing brace is not visible in this
 * excerpt. */
void X(codelet_hb_16) (planner *p) {
816
X(khc2hc_register) (p, hb_16, &desc);
818
#endif /* HAVE_FMA */