39
39
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
40
40
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
42
for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 22, MAKE_VOLATILE_STRIDE(rs)) {
43
E T2S, T2V, T2w, T2Z, T2T, T2I, T2Q, T2Y, T2U, T2K, T2G, T30, T2W;
45
E Tb, T1Z, T2D, T1E, T1N, T2y, TD, T2t, T1U, T1e, T2o, TY, T1f, TI, T1g;
46
E TN, Tm, T1V, T2z, T1H, T1Q, T2E, T19, T2u;
50
E Tu, T6, TT, TS, T5, Tt, Tw, Tx, TB, T9, Ty;
52
E T1, Tp, Tq, Tr, T4, T2, T3, T7, T8, Ts;
64
TS = FNMS(KP500000000, T4, T1);
43
for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 22, MAKE_VOLATILE_STRIDE(rs)) {
44
E T2S, T2V, T2w, T2Z, T2T, T2I, T2Q, T2Y, T2U, T2K, T2G, T30, T2W;
46
E Tb, T1Z, T2D, T1E, T1N, T2y, TD, T2t, T1U, T1e, T2o, TY, T1f, TI, T1g;
47
E TN, Tm, T1V, T2z, T1H, T1Q, T2E, T19, T2u;
51
E Tu, T6, TT, TS, T5, Tt, Tw, Tx, TB, T9, Ty;
53
E T1, Tp, Tq, Tr, T4, T2, T3, T7, T8, Ts;
65
TS = FNMS(KP500000000, T4, T1);
70
Tt = FNMS(KP500000000, Ts, Tp);
78
E T1L, Tv, Ta, TV, TW, Tz;
79
T1L = FNMS(KP866025403, Tu, Tt);
80
Tv = FMA(KP866025403, Tu, Tt);
82
TV = FNMS(KP500000000, T9, T6);
86
E TC, T1M, T1C, TA, T1D;
87
T1C = FMA(KP866025403, TT, TS);
88
TU = FNMS(KP866025403, TT, TS);
90
TA = FNMS(KP500000000, Tz, Tw);
91
T1D = FNMS(KP866025403, TW, TV);
92
TX = FMA(KP866025403, TW, TV);
95
TC = FNMS(KP866025403, TB, TA);
96
T1M = FMA(KP866025403, TB, TA);
107
E T12, Th, TH, TE, Tg, T11, T14, TK, T17, Tk, TL;
109
E Tc, TZ, TF, TG, Tf, Td, Te, Ti, Tj, T10;
125
TE = FNMS(KP500000000, Tf, Tc);
130
T11 = FMA(KP500000000, T10, TZ);
138
E T1O, T13, Tl, TJ, TM, T15;
139
T1O = FNMS(KP866025403, T12, T11);
140
T13 = FMA(KP866025403, T12, T11);
142
TJ = FNMS(KP500000000, Tk, Th);
146
E T18, T1P, T1F, T16, T1G;
147
T1F = FNMS(KP866025403, TH, TE);
148
TI = FMA(KP866025403, TH, TE);
150
T16 = FMA(KP500000000, T15, T14);
151
T1G = FNMS(KP866025403, TM, TJ);
152
TN = FMA(KP866025403, TM, TJ);
155
T18 = FNMS(KP866025403, T17, T16);
156
T1P = FMA(KP866025403, T17, T16);
168
E T20, T2p, T1v, T1s, T1q, T1y, T1u, T1z, T1t;
170
E T1m, Tn, T1a, T1p, T1i, To, TP, TR, T1h, TO;
186
E T1l, T1o, T1n, T1x, T1r;
192
T1k = FNMS(TR, TP, T1j);
193
T1b = FMA(TR, T1a, TQ);
203
T1q = FNMS(T1o, T1p, T1n);
204
T1y = FMA(T1l, T1p, T1x);
211
E T2e, T2h, T1S, T2j, T2f, T26, T2c, T2m, T2g, T24, T22;
213
E T2b, T1R, T27, T2a, T1B, T29, T2l, T1K, T1J, T1W, T21, T25, T2d, T23, T1X;
216
E T1I, T28, T1A, T1w, T1T;
217
T1A = FNMS(T1u, T1s, T1z);
218
T1w = FMA(T1u, T1v, T1t);
223
Im[WS(rs, 3)] = T1A - T1y;
224
Ip[WS(rs, 3)] = T1y + T1A;
225
Rm[WS(rs, 3)] = T1q + T1w;
226
Rp[WS(rs, 3)] = T1q - T1w;
244
T1S = FNMS(T1K, T1R, T1J);
247
T26 = FMA(T1B, T1R, T25);
249
T2c = FNMS(T2a, T2b, T29);
250
T2m = FMA(T27, T2b, T2l);
252
T24 = FNMS(T1Y, T1W, T23);
253
T22 = FMA(T1Y, T21, T1X);
256
E T2L, T2O, T2P, T2v, T2N, T2X, T2n, T2s, T2A, T2F, T2r, T2H, T2R, T2J, T2B;
259
E T2q, T2k, T2i, T2M, T2x;
260
T2k = FNMS(T2g, T2e, T2j);
261
T2i = FMA(T2g, T2h, T2f);
262
Im[WS(rs, 1)] = T24 - T26;
263
Ip[WS(rs, 1)] = T24 + T26;
264
Rm[WS(rs, 1)] = T22 + T1S;
265
Rp[WS(rs, 1)] = T1S - T22;
266
Im[WS(rs, 4)] = T2k - T2m;
267
Ip[WS(rs, 4)] = T2k + T2m;
268
Rm[WS(rs, 4)] = T2i + T2c;
269
Rp[WS(rs, 4)] = T2c - T2i;
291
T2w = FNMS(T2s, T2v, T2r);
294
T2I = FMA(T2n, T2v, T2H);
296
T2Q = FNMS(T2O, T2P, T2N);
297
T2Y = FMA(T2L, T2P, T2X);
299
T2K = FNMS(T2C, T2A, T2J);
300
T2G = FMA(T2C, T2F, T2B);
305
T30 = FNMS(T2U, T2S, T2Z);
306
T2W = FMA(T2U, T2V, T2T);
307
Im[WS(rs, 2)] = T2K - T2I;
308
Ip[WS(rs, 2)] = T2I + T2K;
309
Rm[WS(rs, 2)] = T2w + T2G;
310
Rp[WS(rs, 2)] = T2w - T2G;
311
Im[WS(rs, 5)] = T30 - T2Y;
312
Ip[WS(rs, 5)] = T2Y + T30;
313
Rm[WS(rs, 5)] = T2Q + T2W;
314
Rp[WS(rs, 5)] = T2Q - T2W;
319
static const tw_instr twinstr[] = {
324
static const hc2c_desc desc = { 12, "hc2cbdft_12", twinstr, &GENUS, {96, 22, 46, 0} };
326
void X(codelet_hc2cbdft_12) (planner *p) {
327
X(khc2c_register) (p, hc2cbdft_12, &desc, HC2C_VIA_DFT);
331
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 12 -dif -name hc2cbdft_12 -include hc2cb.h */
334
* This function contains 142 FP additions, 60 FP multiplications,
335
* (or, 112 additions, 30 multiplications, 30 fused multiply/add),
336
* 47 stack variables, 2 constants, and 48 memory accesses
340
static void hc2cbdft_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
342
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
343
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
346
for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 22, MAKE_VOLATILE_STRIDE(rs)) {
347
E Tv, T1E, TC, T1F, TW, T1x, TT, T1w, T1d, T1N, Tb, T1R, TI, T1z, TN;
348
E T1A, T17, T1I, T12, T1H, T1g, T1S, Tm, T1O;
350
E T1, Tq, T6, TA, T4, Tp, Tt, TS, T9, Tw, Tz, TV;
360
Tp = KP866025403 * (T2 - T3);
364
TS = KP866025403 * (Tr + Ts);
371
Tw = KP866025403 * (T7 - T8);
375
TV = KP866025403 * (Tx + Ty);
379
Tu = FMA(KP500000000, Tt, Tq);
382
TB = FMS(KP500000000, Tz, TA);
385
TU = FNMS(KP500000000, T9, T6);
388
TR = FNMS(KP500000000, T4, T1);
69
Tt = FNMS(KP500000000, Ts, Tp);
77
E T1L, Tv, Ta, TV, TW, Tz;
78
T1L = FNMS(KP866025403, Tu, Tt);
79
Tv = FMA(KP866025403, Tu, Tt);
81
TV = FNMS(KP500000000, T9, T6);
85
E TC, T1M, T1C, TA, T1D;
86
T1C = FMA(KP866025403, TT, TS);
87
TU = FNMS(KP866025403, TT, TS);
89
TA = FNMS(KP500000000, Tz, Tw);
90
T1D = FNMS(KP866025403, TW, TV);
91
TX = FMA(KP866025403, TW, TV);
94
TC = FNMS(KP866025403, TB, TA);
95
T1M = FMA(KP866025403, TB, TA);
106
E T12, Th, TH, TE, Tg, T11, T14, TK, T17, Tk, TL;
405
E Tc, T10, Th, T15, Tf, TY, TH, TZ, Tk, T13, TM, T14;
415
TY = KP866025403 * (Td - Te);
418
TH = KP866025403 * (TF - TG);
426
T13 = KP866025403 * (Ti - Tj);
429
TM = KP866025403 * (TK - TL);
434
TE = FNMS(KP500000000, Tf, Tc);
437
TJ = FNMS(KP500000000, Tk, Th);
440
T16 = FMA(KP500000000, T14, T15);
443
T11 = FMA(KP500000000, TZ, T10);
108
E Tc, TZ, TF, TG, Tf, Td, Te, Ti, Tj, T10;
124
TE = FNMS(KP500000000, Tf, Tc);
129
T11 = FMA(KP500000000, T10, TZ);
137
E T1O, T13, Tl, TJ, TM, T15;
138
T1O = FNMS(KP866025403, T12, T11);
139
T13 = FMA(KP866025403, T12, T11);
141
TJ = FNMS(KP500000000, Tk, Th);
145
E T18, T1P, T1F, T16, T1G;
146
T1F = FNMS(KP866025403, TH, TE);
147
TI = FMA(KP866025403, TH, TE);
149
T16 = FMA(KP500000000, T15, T14);
150
T1G = FNMS(KP866025403, TM, TJ);
151
TN = FMA(KP866025403, TM, TJ);
154
T18 = FNMS(KP866025403, T17, T16);
155
T1P = FMA(KP866025403, T17, T16);
167
E T20, T2p, T1v, T1s, T1q, T1y, T1u, T1z, T1t;
460
E Tn, T1h, TP, T1p, T19, T1r, T1n, T1t;
169
E T1m, Tn, T1a, T1p, T1i, To, TP, TR, T1h, TO;
474
E T1k, T1m, T1j, T1l;
479
T1n = FNMS(T1l, T1m, T1j * T1k);
480
T1t = FMA(T1l, T1k, T1j * T1m);
185
E T1l, T1o, T1n, T1x, T1r;
191
T1k = FNMS(TR, TP, T1j);
192
T1b = FMA(TR, T1a, TQ);
202
T1q = FNMS(T1o, T1p, T1n);
203
T1y = FMA(T1l, T1p, T1x);
210
E T2e, T2h, T1S, T2j, T2f, T26, T2c, T2m, T2g, T24, T22;
212
E T2b, T1R, T27, T2a, T1B, T29, T2l, T1K, T1J, T1W, T21, T25, T2d, T23, T1X;
215
E T1I, T28, T1A, T1w, T1T;
216
T1A = FNMS(T1u, T1s, T1z);
217
T1w = FMA(T1u, T1v, T1t);
222
Im[WS(rs, 3)] = T1A - T1y;
223
Ip[WS(rs, 3)] = T1y + T1A;
224
Rm[WS(rs, 3)] = T1q + T1w;
225
Rp[WS(rs, 3)] = T1q - T1w;
243
T1S = FNMS(T1K, T1R, T1J);
246
T26 = FMA(T1B, T1R, T25);
248
T2c = FNMS(T2a, T2b, T29);
249
T2m = FMA(T27, T2b, T2l);
251
T24 = FNMS(T1Y, T1W, T23);
252
T22 = FMA(T1Y, T21, T1X);
255
E T2L, T2O, T2P, T2v, T2N, T2X, T2n, T2s, T2A, T2F, T2r, T2H, T2R, T2J, T2B;
258
E T2q, T2k, T2i, T2M, T2x;
259
T2k = FNMS(T2g, T2e, T2j);
260
T2i = FMA(T2g, T2h, T2f);
261
Im[WS(rs, 1)] = T24 - T26;
262
Ip[WS(rs, 1)] = T24 + T26;
263
Rm[WS(rs, 1)] = T22 + T1S;
264
Rp[WS(rs, 1)] = T1S - T22;
265
Im[WS(rs, 4)] = T2k - T2m;
266
Ip[WS(rs, 4)] = T2k + T2m;
267
Rm[WS(rs, 4)] = T2i + T2c;
268
Rp[WS(rs, 4)] = T2c - T2i;
290
T2w = FNMS(T2s, T2v, T2r);
293
T2I = FMA(T2n, T2v, T2H);
295
T2Q = FNMS(T2O, T2P, T2N);
296
T2Y = FMA(T2L, T2P, T2X);
298
T2K = FNMS(T2C, T2A, T2J);
299
T2G = FMA(T2C, T2F, T2B);
304
T30 = FNMS(T2U, T2S, T2Z);
305
T2W = FMA(T2U, T2V, T2T);
306
Im[WS(rs, 2)] = T2K - T2I;
307
Ip[WS(rs, 2)] = T2I + T2K;
308
Rm[WS(rs, 2)] = T2w + T2G;
309
Rp[WS(rs, 2)] = T2w - T2G;
310
Im[WS(rs, 5)] = T30 - T2Y;
311
Ip[WS(rs, 5)] = T2Y + T30;
312
Rm[WS(rs, 5)] = T2Q + T2W;
313
Rp[WS(rs, 5)] = T2Q - T2W;
317
static const tw_instr twinstr[] = {
322
static const hc2c_desc desc = { 12, "hc2cbdft_12", twinstr, &GENUS, {96, 22, 46, 0} };
324
void X(codelet_hc2cbdft_12) (planner *p) {
325
X(khc2c_register) (p, hc2cbdft_12, &desc, HC2C_VIA_DFT);
329
/* Generated by: ../../../genfft/gen_hc2cdft -compact -variables 4 -pipeline-latency 4 -sign 1 -n 12 -dif -name hc2cbdft_12 -include hc2cb.h */
332
* This function contains 142 FP additions, 60 FP multiplications,
333
* (or, 112 additions, 30 multiplications, 30 fused multiply/add),
334
* 47 stack variables, 2 constants, and 48 memory accesses
338
static void hc2cbdft_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
340
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
341
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
343
for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 22, MAKE_VOLATILE_STRIDE(rs)) {
344
E Tv, T1E, TC, T1F, TW, T1x, TT, T1w, T1d, T1N, Tb, T1R, TI, T1z, TN;
345
E T1A, T17, T1I, T12, T1H, T1g, T1S, Tm, T1O;
347
E T1, Tq, T6, TA, T4, Tp, Tt, TS, T9, Tw, Tz, TV;
357
Tp = KP866025403 * (T2 - T3);
361
TS = KP866025403 * (Tr + Ts);
368
Tw = KP866025403 * (T7 - T8);
372
TV = KP866025403 * (Tx + Ty);
376
Tu = FMA(KP500000000, Tt, Tq);
379
TB = FMS(KP500000000, Tz, TA);
382
TU = FNMS(KP500000000, T9, T6);
385
TR = FNMS(KP500000000, T4, T1);
402
E Tc, T10, Th, T15, Tf, TY, TH, TZ, Tk, T13, TM, T14;
412
TY = KP866025403 * (Td - Te);
415
TH = KP866025403 * (TF - TG);
423
T13 = KP866025403 * (Ti - Tj);
426
TM = KP866025403 * (TK - TL);
431
TE = FNMS(KP500000000, Tf, Tc);
434
TJ = FNMS(KP500000000, Tk, Th);
437
T16 = FMA(KP500000000, T14, T15);
440
T11 = FMA(KP500000000, TZ, T10);
457
E Tn, T1h, TP, T1p, T19, T1r, T1n, T1t;
471
E T1k, T1m, T1j, T1l;
476
T1n = FNMS(T1l, T1m, T1j * T1k);
477
T1t = FMA(T1l, T1k, T1j * T1m);
484
T1a = FMA(To, TP, TQ * T19);
485
T1i = FNMS(TQ, TP, To * T19);
492
E T1s, T1u, T1o, T1q;
495
T1s = FMA(T1o, T1p, T1q * T1r);
496
T1u = FNMS(T1q, T1p, T1o * T1r);
497
Rp[WS(rs, 3)] = T1n - T1s;
498
Ip[WS(rs, 3)] = T1t + T1u;
499
Rm[WS(rs, 3)] = T1n + T1s;
500
Im[WS(rs, 3)] = T1u - T1t;
504
E T1C, T1Y, T1K, T20, T1U, T1V, T26, T27;
506
E T1y, T1B, T1G, T1J;
517
E T1P, T1T, T1M, T1Q;
522
T1U = FMA(T1M, T1P, T1Q * T1T);
523
T1V = FNMS(T1Q, T1P, T1M * T1T);
526
E T23, T25, T22, T24;
531
T26 = FMA(T22, T23, T24 * T25);
532
T27 = FNMS(T24, T23, T22 * T25);
535
E T1L, T1W, T1v, T1D;
538
T1L = FNMS(T1D, T1K, T1v * T1C);
539
T1W = FMA(T1D, T1C, T1v * T1K);
540
Rp[WS(rs, 1)] = T1L - T1U;
541
Ip[WS(rs, 1)] = T1V + T1W;
542
Rm[WS(rs, 1)] = T1U + T1L;
543
Im[WS(rs, 1)] = T1V - T1W;
546
E T21, T28, T1X, T1Z;
549
T21 = FNMS(T1Z, T20, T1X * T1Y);
550
T28 = FMA(T1Z, T1Y, T1X * T20);
551
Rp[WS(rs, 4)] = T21 - T26;
552
Ip[WS(rs, 4)] = T27 + T28;
553
Rm[WS(rs, 4)] = T26 + T21;
554
Im[WS(rs, 4)] = T27 - T28;
558
E T2c, T2u, T2p, T2B, T2g, T2w, T2l, T2z;
560
E T2a, T2b, T2n, T2o;
571
E T2e, T2f, T2j, T2k;
582
E T2h, T2r, T2q, T2s;
584
E T29, T2d, T2i, T2m;
587
T2h = FNMS(T2d, T2g, T29 * T2c);
588
T2r = FMA(T2d, T2c, T29 * T2g);
591
T2q = FMA(T2i, T2l, T2m * T2p);
592
T2s = FNMS(T2m, T2l, T2i * T2p);
594
Rp[WS(rs, 2)] = T2h - T2q;
595
Ip[WS(rs, 2)] = T2r + T2s;
596
Rm[WS(rs, 2)] = T2h + T2q;
597
Im[WS(rs, 2)] = T2s - T2r;
600
E T2x, T2D, T2C, T2E;
602
E T2t, T2v, T2y, T2A;
605
T2x = FNMS(T2v, T2w, T2t * T2u);
606
T2D = FMA(T2v, T2u, T2t * T2w);
609
T2C = FMA(T2y, T2z, T2A * T2B);
610
T2E = FNMS(T2A, T2z, T2y * T2B);
612
Rp[WS(rs, 5)] = T2x - T2C;
613
Ip[WS(rs, 5)] = T2D + T2E;
614
Rm[WS(rs, 5)] = T2x + T2C;
615
Im[WS(rs, 5)] = T2E - T2D;
487
T1a = FMA(To, TP, TQ * T19);
488
T1i = FNMS(TQ, TP, To * T19);
495
E T1s, T1u, T1o, T1q;
498
T1s = FMA(T1o, T1p, T1q * T1r);
499
T1u = FNMS(T1q, T1p, T1o * T1r);
500
Rp[WS(rs, 3)] = T1n - T1s;
501
Ip[WS(rs, 3)] = T1t + T1u;
502
Rm[WS(rs, 3)] = T1n + T1s;
503
Im[WS(rs, 3)] = T1u - T1t;
507
E T1C, T1Y, T1K, T20, T1U, T1V, T26, T27;
509
E T1y, T1B, T1G, T1J;
520
E T1P, T1T, T1M, T1Q;
525
T1U = FMA(T1M, T1P, T1Q * T1T);
526
T1V = FNMS(T1Q, T1P, T1M * T1T);
529
E T23, T25, T22, T24;
534
T26 = FMA(T22, T23, T24 * T25);
535
T27 = FNMS(T24, T23, T22 * T25);
538
E T1L, T1W, T1v, T1D;
541
T1L = FNMS(T1D, T1K, T1v * T1C);
542
T1W = FMA(T1D, T1C, T1v * T1K);
543
Rp[WS(rs, 1)] = T1L - T1U;
544
Ip[WS(rs, 1)] = T1V + T1W;
545
Rm[WS(rs, 1)] = T1U + T1L;
546
Im[WS(rs, 1)] = T1V - T1W;
549
E T21, T28, T1X, T1Z;
552
T21 = FNMS(T1Z, T20, T1X * T1Y);
553
T28 = FMA(T1Z, T1Y, T1X * T20);
554
Rp[WS(rs, 4)] = T21 - T26;
555
Ip[WS(rs, 4)] = T27 + T28;
556
Rm[WS(rs, 4)] = T26 + T21;
557
Im[WS(rs, 4)] = T27 - T28;
561
E T2c, T2u, T2p, T2B, T2g, T2w, T2l, T2z;
563
E T2a, T2b, T2n, T2o;
574
E T2e, T2f, T2j, T2k;
585
E T2h, T2r, T2q, T2s;
587
E T29, T2d, T2i, T2m;
590
T2h = FNMS(T2d, T2g, T29 * T2c);
591
T2r = FMA(T2d, T2c, T29 * T2g);
594
T2q = FMA(T2i, T2l, T2m * T2p);
595
T2s = FNMS(T2m, T2l, T2i * T2p);
597
Rp[WS(rs, 2)] = T2h - T2q;
598
Ip[WS(rs, 2)] = T2r + T2s;
599
Rm[WS(rs, 2)] = T2h + T2q;
600
Im[WS(rs, 2)] = T2s - T2r;
603
E T2x, T2D, T2C, T2E;
605
E T2t, T2v, T2y, T2A;
608
T2x = FNMS(T2v, T2w, T2t * T2u);
609
T2D = FMA(T2v, T2u, T2t * T2w);
612
T2C = FMA(T2y, T2z, T2A * T2B);
613
T2E = FNMS(T2A, T2z, T2y * T2B);
615
Rp[WS(rs, 5)] = T2x - T2C;
616
Ip[WS(rs, 5)] = T2D + T2E;
617
Rm[WS(rs, 5)] = T2x + T2C;
618
Im[WS(rs, 5)] = T2E - T2D;