/*
 * Copyright (c) 2003, 2006 Matteo Frigo
 * Copyright (c) 2003, 2006 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 */
21
/* This file was automatically generated --- DO NOT EDIT */
22
/* Generated on Sun Jul 2 14:19:41 EDT 2006 */
24
#include "codelet-rdft.h"
28
/* Generated by: ../../../genfft/gen_r2hc -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 32 -name r2hc_32 -include r2hc.h */
/*
 * This function contains 156 FP additions, 68 FP multiplications,
 * (or, 88 additions, 0 multiplications, 68 fused multiply/add),
 * 89 stack variables, and 64 memory accesses
 */
37
* $Id: algsimp.ml,v 1.9 2006-02-12 23:34:12 athena Exp $
38
* $Id: fft.ml,v 1.4 2006-01-05 03:04:27 stevenj Exp $
39
* $Id: gen_r2hc.ml,v 1.18 2006-02-12 23:34:12 athena Exp $
44
/*
 * r2hc_32, fused multiply-add variant (the generator command line for this
 * copy carries the -fma flag).
 *
 * Presumably the size-32 real-to-halfcomplex DFT kernel: the generator was
 * invoked as gen_r2hc with -n 32 -name r2hc_32.
 *
 * NOTE(review): this view of the function is an excerpt.  The braces, the
 * loads from I, the declarations of several temporaries (e.g. Tz, T1q, T1r,
 * Tf, Tu) and the reason for the bare numeric lines interleaved with the
 * code are not visible here; confirm against the pristine generated file
 * before relying on this block.
 *
 * Parameters, as far as the visible code shows:
 *   I        - input pointer; advanced by ivs after every iteration.
 *   ro, io   - output pointers, written as ro[WS(ros, k)] and io[WS(ios, k)];
 *              both advanced by ovs after every iteration.
 *   is       - stride, only seen in MAKE_VOLATILE_STRIDE(is) here; presumably
 *              the stride used for the reads from I (loads not visible).
 *   ros, ios - strides passed to WS() for the ro and io stores.
 *   v        - iteration count; the loop starts at v and runs while i > 0.
 *   ivs, ovs - per-iteration pointer increments for I and for ro/io.
 *
 * FMA, FMS, FNMS, DK, WS, E, R and INT are defined outside this file
 * (presumably reached through codelet-rdft.h).  Presumably
 * FMA(a,b,c) = a*b + c, FMS(a,b,c) = a*b - c and FNMS(a,b,c) = c - a*b
 * -- TODO confirm against the macro definitions.
 */
static void r2hc_32(const R *I, R *ro, R *io, stride is, stride ros, stride ios, INT v, INT ivs, INT ovs)
46
/*
 * Constants declared through DK().  By value they match:
 *   KP831469612 = cos(3*pi/16),  KP668178637 = tan(3*pi/16),
 *   KP980785280 = cos(pi/16),    KP198912367 = tan(pi/16),
 *   KP923879532 = cos(pi/8),     KP414213562 = tan(pi/8) = sqrt(2) - 1,
 *   KP707106781 = sqrt(2)/2.
 */
DK(KP831469612, +0.831469612302545237078788377617905756738560812);
47
DK(KP668178637, +0.668178637919298919997757686523080761552472251);
48
DK(KP980785280, +0.980785280403230449126182236134239036973933731);
49
DK(KP198912367, +0.198912367379658006911597622644676228597850501);
50
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
51
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
52
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
54
/*
 * Runs v iterations.  After each one, I advances by ivs and both ro and io
 * advance by ovs.  MAKE_VOLATILE_STRIDE (defined elsewhere) is applied to
 * is, ros and ios as part of the loop increment.
 */
for (i = v; i > 0; i = i - 1, I = I + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(is), MAKE_VOLATILE_STRIDE(ros), MAKE_VOLATILE_STRIDE(ios)) {
55
E T1x, T1M, T1I, T1E, T1J, T1H;
57
E Tv, T1h, T7, T2b, Te, T2n, Ty, T1i, T1l, TF, T2d, Tt, T1k, TC, T2c;
58
E Tm, T2j, T1Z, T2k, T22, TK, T1B, T19, T1C, T1e, TO, TV, T1T, TN, TP;
69
E Ta, Tw, Tx, Td, Tn, To;
71
E T8, T3, T6, T9, Tb, Tc;
110
T1l = FNMS(KP414213562, TD, TE);
111
TF = FMA(KP414213562, TE, TD);
117
E T11, T15, T1c, T20, T14, T16, T1X, T1Y, T1Q, T1R;
119
E T1a, T1b, T12, T13;
129
T1k = FMA(KP414213562, TA, TB);
130
TC = FNMS(KP414213562, TB, TA);
149
E TI, T21, T17, TJ, T18, T1d;
161
T1B = FNMS(KP707106781, T18, T11);
162
T19 = FMA(KP707106781, T18, T11);
163
T1C = FNMS(KP707106781, T1d, T1c);
164
T1e = FMA(KP707106781, T1d, T1c);
182
E T1P, T25, T23, T2h, T1W, T1y, TS, T1z, TX, T27, T2a;
186
E T1U, TQ, T1V, TR, TW;
202
T1y = FNMS(KP707106781, TR, TK);
203
TS = FMA(KP707106781, TR, TK);
204
T1z = FNMS(KP707106781, TW, TV);
205
TX = FMA(KP707106781, TW, TV);
208
/*
 * From here on the visible statements include the output stores:
 * ro[] is indexed through WS(ros, k) and io[] through WS(ios, k).
 */
ro[WS(ros, 8)] = Tf - Tu;
209
io[WS(ios, 8)] = T29 - T28;
212
ro[WS(ros, 16)] = T27 - T2a;
214
E T2s, T2i, T2v, T2f, T2r, T2p, T2l, T2t;
216
E T2o, T2e, T26, T24;
219
T2s = FNMS(KP414213562, T2g, T2h);
220
T2i = FMA(KP414213562, T2h, T2g);
223
T2v = FNMS(KP707106781, T2e, T2b);
224
T2f = FMA(KP707106781, T2e, T2b);
225
T2r = FMA(KP707106781, T2o, T2n);
226
T2p = FNMS(KP707106781, T2o, T2n);
227
io[WS(ios, 4)] = FMA(KP707106781, T26, T25);
228
io[WS(ios, 12)] = FMS(KP707106781, T26, T25);
229
ro[WS(ros, 4)] = FMA(KP707106781, T24, T1P);
230
ro[WS(ros, 12)] = FNMS(KP707106781, T24, T1P);
231
T2l = FNMS(KP414213562, T2k, T2j);
232
T2t = FMA(KP414213562, T2j, T2k);
235
E T1v, T1G, TH, T1s, T1F, T1w, T1o, T1g, T1p, T1n;
237
E T1f, TY, T1t, T1u, T1j, T1m;
240
T1v = FNMS(KP707106781, Ty, Tv);
241
Tz = FMA(KP707106781, Ty, Tv);
243
E T2q, T2m, T2w, T2u;
248
io[WS(ios, 10)] = FMA(KP923879532, T2q, T2p);
249
io[WS(ios, 6)] = FMS(KP923879532, T2q, T2p);
250
ro[WS(ros, 2)] = FMA(KP923879532, T2m, T2f);
251
ro[WS(ros, 14)] = FNMS(KP923879532, T2m, T2f);
252
ro[WS(ros, 10)] = FNMS(KP923879532, T2w, T2v);
253
ro[WS(ros, 6)] = FMA(KP923879532, T2w, T2v);
254
io[WS(ios, 2)] = FMA(KP923879532, T2u, T2r);
255
io[WS(ios, 14)] = FMS(KP923879532, T2u, T2r);
259
T1f = FNMS(KP198912367, T1e, T19);
260
T1q = FMA(KP198912367, T19, T1e);
261
T1r = FMA(KP198912367, TS, TX);
262
TY = FNMS(KP198912367, TX, TS);
263
T1t = FNMS(KP923879532, TG, Tz);
264
TH = FMA(KP923879532, TG, Tz);
267
T1F = FMA(KP707106781, T1i, T1h);
268
T1j = FNMS(KP707106781, T1i, T1h);
272
ro[WS(ros, 7)] = FMA(KP980785280, T1u, T1t);
275
T1p = FMA(KP923879532, T1m, T1j);
276
T1n = FNMS(KP923879532, T1m, T1j);
277
ro[WS(ros, 9)] = FNMS(KP980785280, T1u, T1t);
279
ro[WS(ros, 1)] = FMA(KP980785280, T1g, TH);
280
ro[WS(ros, 15)] = FNMS(KP980785280, T1g, TH);
281
io[WS(ios, 1)] = FMS(KP980785280, T1s, T1p);
282
io[WS(ios, 15)] = FMA(KP980785280, T1s, T1p);
283
io[WS(ios, 9)] = FMS(KP980785280, T1o, T1n);
284
io[WS(ios, 7)] = FMA(KP980785280, T1o, T1n);
286
E T1A, T1D, T1N, T1O, T1K, T1L;
287
T1A = FMA(KP668178637, T1z, T1y);
288
T1K = FNMS(KP668178637, T1y, T1z);
289
T1L = FNMS(KP668178637, T1B, T1C);
290
T1D = FMA(KP668178637, T1C, T1B);
291
T1N = FNMS(KP923879532, T1w, T1v);
292
T1x = FMA(KP923879532, T1w, T1v);
295
ro[WS(ros, 5)] = FNMS(KP831469612, T1O, T1N);
298
T1J = FMA(KP923879532, T1G, T1F);
299
T1H = FNMS(KP923879532, T1G, T1F);
300
ro[WS(ros, 11)] = FMA(KP831469612, T1O, T1N);
306
io[WS(ios, 3)] = FMA(KP831469612, T1M, T1J);
307
ro[WS(ros, 3)] = FMA(KP831469612, T1E, T1x);
308
io[WS(ios, 13)] = FMS(KP831469612, T1M, T1J);
309
ro[WS(ros, 13)] = FNMS(KP831469612, T1E, T1x);
310
io[WS(ios, 11)] = FMA(KP831469612, T1I, T1H);
311
io[WS(ios, 5)] = FMS(KP831469612, T1I, T1H);
315
/*
 * Descriptor passed to X(kr2hc_register) together with r2hc_32.  The leading
 * 32 matches the generator's "-n 32" and the string is the codelet name.
 * {88, 0, 68, 0} lines up with the generator's reported 88 additions,
 * 0 multiplications and 68 fused multiply/adds.  The meaning of the fourth
 * count, of GENUS and of the trailing zero fields is not visible here --
 * TODO confirm against the kr2hc_desc definition.
 */
static const kr2hc_desc desc = { 32, "r2hc_32", {88, 0, 68, 0}, &GENUS, 0, 0, 0, 0, 0 };
317
/*
 * Registration hook: hands the r2hc_32 kernel and its descriptor to
 * X(kr2hc_register) for planner p.  X() is a macro defined outside this
 * file (presumably a symbol-name prefix -- TODO confirm).
 */
void X(codelet_r2hc_32) (planner *p) {
318
X(kr2hc_register) (p, r2hc_32, &desc);
323
/* Generated by: ../../../genfft/gen_r2hc -compact -variables 4 -pipeline-latency 4 -n 32 -name r2hc_32 -include r2hc.h */
/*
 * This function contains 156 FP additions, 42 FP multiplications,
 * (or, 140 additions, 26 multiplications, 16 fused multiply/add),
 * 54 stack variables, and 64 memory accesses
 */
332
* $Id: algsimp.ml,v 1.9 2006-02-12 23:34:12 athena Exp $
333
* $Id: fft.ml,v 1.4 2006-01-05 03:04:27 stevenj Exp $
334
* $Id: gen_r2hc.ml,v 1.18 2006-02-12 23:34:12 athena Exp $
339
/*
 * r2hc_32, variant generated without the -fma flag: the visible arithmetic
 * mixes explicit products such as KP707106781 * (Tw + Tx) with FMA/FNMS
 * calls and plain additions/subtractions for the output stores.
 *
 * Presumably the size-32 real-to-halfcomplex DFT kernel: the generator was
 * invoked as gen_r2hc with -n 32 -name r2hc_32.
 *
 * NOTE(review): the trailing "#endif HAVE_FMA" marker at the end of the file
 * suggests this copy is the alternative compiled when HAVE_FMA is not
 * defined, but the matching #ifdef/#else lines are not visible.  This view
 * is an excerpt: braces, the loads from I, several declarations and the
 * reason for the bare numeric lines interleaved with the code are not
 * visible here; confirm against the pristine generated file.
 *
 * Parameters, as far as the visible code shows:
 *   I        - input pointer; advanced by ivs after every iteration.
 *   ro, io   - output pointers, written as ro[WS(ros, k)] and io[WS(ios, k)];
 *              both advanced by ovs after every iteration.
 *   is       - stride, only seen in MAKE_VOLATILE_STRIDE(is) here; presumably
 *              the stride used for the reads from I (loads not visible).
 *   ros, ios - strides passed to WS() for the ro and io stores.
 *   v        - iteration count; the loop starts at v and runs while i > 0.
 *   ivs, ovs - per-iteration pointer increments for I and for ro/io.
 *
 * FMA, FNMS, DK, WS, E, R and INT are defined outside this file; presumably
 * FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b -- TODO confirm.
 */
static void r2hc_32(const R *I, R *ro, R *io, stride is, stride ros, stride ios, INT v, INT ivs, INT ovs)
341
/*
 * Constants declared through DK().  By value they match:
 *   KP555570233 = sin(3*pi/16),  KP831469612 = cos(3*pi/16),
 *   KP195090322 = sin(pi/16),    KP980785280 = cos(pi/16),
 *   KP382683432 = sin(pi/8),     KP923879532 = cos(pi/8),
 *   KP707106781 = sqrt(2)/2.
 */
DK(KP555570233, +0.555570233019602224742830813948532874374937191);
342
DK(KP831469612, +0.831469612302545237078788377617905756738560812);
343
DK(KP195090322, +0.195090322016128267848284868477022240927691618);
344
DK(KP980785280, +0.980785280403230449126182236134239036973933731);
345
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
346
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
347
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
349
/*
 * Runs v iterations.  After each one, I advances by ivs and both ro and io
 * advance by ovs.  MAKE_VOLATILE_STRIDE (defined elsewhere) is applied to
 * is, ros and ios as part of the loop increment.
 */
for (i = v; i > 0; i = i - 1, I = I + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(is), MAKE_VOLATILE_STRIDE(ros), MAKE_VOLATILE_STRIDE(ios)) {
350
E T7, T2b, Tv, T1l, Te, T2o, Ty, T1k, Tt, T2d, TF, T1h, Tm, T2c, TC;
351
E T1i, T1Z, T22, T2k, T2j, T1e, T1C, T19, T1B, T1S, T1V, T2h, T2g, TX, T1z;
354
E T1, T2, T3, T4, T5, T6;
381
Ty = KP707106781 * (Tw + Tx);
382
T1k = KP707106781 * (Tx - Tw);
399
TF = FMA(KP923879532, TD, KP382683432 * TE);
400
T1h = FNMS(KP923879532, TE, KP382683432 * TD);
417
TC = FNMS(KP382683432, TB, KP923879532 * TA);
418
T1i = FMA(KP382683432, TA, KP923879532 * TB);
421
E T11, T1X, T1d, T1Y, T14, T20, T17, T21, T1a, T18;
434
E T12, T13, T15, T16;
448
T1a = KP707106781 * (T17 - T14);
451
T18 = KP707106781 * (T14 + T17);
456
E TK, T1Q, TW, T1R, TN, T1T, TQ, T1U, TT, TR;
483
TT = KP707106781 * (TQ - TN);
486
TR = KP707106781 * (TN + TQ);
491
E Tf, Tu, T27, T28, T29, T2a;
498
/*
 * From here on the visible statements include the output stores:
 * ro[] is indexed through WS(ros, k) and io[] through WS(ios, k).
 */
ro[WS(ros, 8)] = Tf - Tu;
499
io[WS(ios, 8)] = T29 - T28;
500
ro[WS(ros, 16)] = T27 - T2a;
504
E T1P, T25, T24, T26, T1W, T23;
509
T24 = KP707106781 * (T1W + T23);
510
T26 = KP707106781 * (T23 - T1W);
511
ro[WS(ros, 12)] = T1P - T24;
512
io[WS(ios, 12)] = T26 - T25;
513
ro[WS(ros, 4)] = T1P + T24;
514
io[WS(ios, 4)] = T25 + T26;
517
E T2f, T2v, T2p, T2r, T2m, T2q, T2u, T2w, T2e, T2n;
518
T2e = KP707106781 * (T2c + T2d);
521
T2n = KP707106781 * (T2d - T2c);
525
E T2i, T2l, T2s, T2t;
526
T2i = FMA(KP923879532, T2g, KP382683432 * T2h);
527
T2l = FNMS(KP382683432, T2k, KP923879532 * T2j);
530
T2s = FNMS(KP382683432, T2g, KP923879532 * T2h);
531
T2t = FMA(KP382683432, T2j, KP923879532 * T2k);
535
ro[WS(ros, 14)] = T2f - T2m;
536
io[WS(ios, 14)] = T2u - T2r;
537
ro[WS(ros, 2)] = T2f + T2m;
538
io[WS(ios, 2)] = T2r + T2u;
539
io[WS(ios, 6)] = T2p + T2q;
540
ro[WS(ros, 6)] = T2v + T2w;
541
io[WS(ios, 10)] = T2q - T2p;
542
ro[WS(ros, 10)] = T2v - T2w;
545
E TH, T1t, T1s, T1u, T1g, T1o, T1n, T1p;
552
T1q = FNMS(KP195090322, TS, KP980785280 * TX);
553
T1r = FMA(KP195090322, T19, KP980785280 * T1e);
559
TY = FMA(KP980785280, TS, KP195090322 * TX);
560
T1f = FNMS(KP195090322, T1e, KP980785280 * T19);
568
ro[WS(ros, 15)] = TH - T1g;
569
io[WS(ios, 15)] = T1s - T1p;
570
ro[WS(ros, 1)] = TH + T1g;
571
io[WS(ios, 1)] = T1p + T1s;
572
io[WS(ios, 7)] = T1n + T1o;
573
ro[WS(ros, 7)] = T1t + T1u;
574
io[WS(ios, 9)] = T1o - T1n;
575
ro[WS(ros, 9)] = T1t - T1u;
578
E T1x, T1N, T1M, T1O, T1E, T1I, T1H, T1J;
580
E T1v, T1w, T1K, T1L;
585
T1K = FNMS(KP555570233, T1y, KP831469612 * T1z);
586
T1L = FMA(KP555570233, T1B, KP831469612 * T1C);
591
E T1A, T1D, T1F, T1G;
592
T1A = FMA(KP831469612, T1y, KP555570233 * T1z);
593
T1D = FNMS(KP555570233, T1C, KP831469612 * T1B);
601
ro[WS(ros, 13)] = T1x - T1E;
602
io[WS(ios, 13)] = T1M - T1J;
603
ro[WS(ros, 3)] = T1x + T1E;
604
io[WS(ios, 3)] = T1J + T1M;
605
io[WS(ios, 5)] = T1H + T1I;
606
ro[WS(ros, 5)] = T1N + T1O;
607
io[WS(ios, 11)] = T1I - T1H;
608
ro[WS(ros, 11)] = T1N - T1O;
613
/*
 * Descriptor passed to X(kr2hc_register) together with r2hc_32.  The leading
 * 32 matches the generator's "-n 32" and the string is the codelet name.
 * {140, 26, 16, 0} lines up with the generator's reported 140 additions,
 * 26 multiplications and 16 fused multiply/adds.  The meaning of the fourth
 * count, of GENUS and of the trailing zero fields is not visible here --
 * TODO confirm against the kr2hc_desc definition.
 */
static const kr2hc_desc desc = { 32, "r2hc_32", {140, 26, 16, 0}, &GENUS, 0, 0, 0, 0, 0 };
615
/*
 * Registration hook: hands the r2hc_32 kernel and its descriptor to
 * X(kr2hc_register) for planner p.  X() is a macro defined outside this
 * file (presumably a symbol-name prefix -- TODO confirm).
 */
void X(codelet_r2hc_32) (planner *p) {
616
X(kr2hc_register) (p, r2hc_32, &desc);
619
#endif /* HAVE_FMA */