2
* Copyright (c) 2003, 2006 Matteo Frigo
3
* Copyright (c) 2003, 2006 Massachusetts Institute of Technology
5
* This program is free software; you can redistribute it and/or modify
6
* it under the terms of the GNU General Public License as published by
7
* the Free Software Foundation; either version 2 of the License, or
8
* (at your option) any later version.
10
* This program is distributed in the hope that it will be useful,
11
* but WITHOUT ANY WARRANTY; without even the implied warranty of
12
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
* GNU General Public License for more details.
15
* You should have received a copy of the GNU General Public License
16
* along with this program; if not, write to the Free Software
17
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21
/* This file was automatically generated --- DO NOT EDIT */
22
/* Generated on Sun Jul 2 16:31:42 EDT 2006 */
24
#include "codelet-rdft.h"
28
/* Generated by: ../../../genfft/gen_hc2hc -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 32 -dif -name hb_32 -include hb.h */
31
* This function contains 434 FP additions, 260 FP multiplications,
32
* (or, 236 additions, 62 multiplications, 198 fused multiply/add),
33
* 141 stack variables, and 128 memory accesses
37
* $Id: algsimp.ml,v 1.9 2006-02-12 23:34:12 athena Exp $
38
* $Id: fft.ml,v 1.4 2006-01-05 03:04:27 stevenj Exp $
39
* $Id: gen_hc2hc.ml,v 1.16 2006-02-12 23:34:12 athena Exp $
44
/*
 * hb_32 (FMA variant) -- machine-generated hc2hc codelet.  Per the
 * "Generated by" banner it was produced by gen_hc2hc with
 * "-fma ... -sign 1 -n 32 -dif -name hb_32".
 *
 * What the visible code shows:
 *   rio, iio  data arrays that are read and then overwritten in place;
 *             rio is indexed at +WS(ios, k), iio at -WS(ios, k).
 *   W         advanced by 62 elements per loop iteration (presumably the
 *             twiddle-factor table described by twinstr -- confirm).
 *   ios       stride argument handed to WS().
 *   m         loop bound: the loop runs for i = m - 2, m - 4, ... while i > 0.
 *   dist      per-iteration step: rio += dist, iio -= dist.
 *
 * FMA(), FNMS(), DK(), WS(), E and R are defined outside this listing;
 * presumably FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b -- confirm.
 *
 * NOTE(review): this copy is missing many generated lines (braces, the
 * declaration of i, most temporary assignments, every use of W and the
 * return statement) and carries stray line-number residue.  Do not
 * hand-edit; regenerate with genfft instead.
 */
static const R *hb_32(R *rio, R *iio, const R *W, stride ios, INT m, INT dist)
46
/* Numeric constants; each name encodes the leading digits of its value.
 * KP980785280 ~ cos(pi/16), KP198912367 ~ tan(pi/16),
 * KP831469612 ~ cos(3*pi/16), KP668178637 ~ tan(3*pi/16),
 * KP923879532 ~ cos(pi/8), KP414213562 ~ tan(pi/8) = sqrt(2) - 1,
 * KP707106781 ~ sqrt(1/2). */
DK(KP980785280, +0.980785280403230449126182236134239036973933731);
47
DK(KP198912367, +0.198912367379658006911597622644676228597850501);
48
DK(KP831469612, +0.831469612302545237078788377617905756738560812);
49
DK(KP668178637, +0.668178637919298919997757686523080761552472251);
50
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
51
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
52
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
54
/* One pass per iteration: i counts down by 2 from m - 2 while rio walks
 * forward by dist, iio walks backward by dist and W advances by 62. */
for (i = m - 2; i > 0; i = i - 2, rio = rio + dist, iio = iio - dist, W = W + 62, MAKE_VOLATILE_STRIDE(ios)) {
55
/* E-typed temporaries; the generator declares them scope by scope. */
E T86, T81, T8b, T7Y, T7Z, T82, T80, T83, T88, T8c, T87;
57
E T3y, Tf, T8x, T7k, T8k, T7N, T2Q, T1a, T33, T2j, T6N, T50, T6A, T63, T4w;
58
E T3U, TZ, T4q, T26, T2p, T8q, T7F, T37, T2Y, T6H, T5K, T8r, T7C, T3X, T3M;
59
E T6G, T5R, Tu, T3R, T2m, T2R, T1t, T34, T8y, T7Q, T4x, T3B, T6O, T66, T8l;
60
E T7r, T6B, T5f, TK, T4p, T5x, T1N, T2o, T8n, T7y, T5u, T36, T2V, T6E, T5r;
61
E T8o, T7v, T3W, T3H, T5Q, T5N;
65
E T4Q, T3, T5Y, T2e, T5X, T6, T4R, T2h, Td, T18, T61, T4V, Ta, T4X, T4W;
72
/* Input loads: rio[+WS(ios, k)] and iio[-WS(ios, k)]. */
T2 = iio[-WS(ios, 16)];
74
T2d = rio[WS(ios, 16)];
80
T5 = iio[-WS(ios, 24)];
82
T2f = iio[-WS(ios, 8)];
83
T2g = rio[WS(ios, 24)];
86
Tb = iio[-WS(ios, 28)];
91
Tc = rio[WS(ios, 12)];
92
T16 = iio[-WS(ios, 12)];
93
T17 = rio[WS(ios, 28)];
95
E T8, T4T, T4U, T9, T13, T14;
101
T9 = iio[-WS(ios, 20)];
102
T13 = iio[-WS(ios, 4)];
103
T14 = rio[WS(ios, 20)];
114
E T12, T2b, T4Z, T4S, T7L, T7M, T19, T2i;
116
E T7, T60, Te, T7i, T7j, T4Y;
131
/* Recurring pattern: an FNMS/FMA pair over the same operands, scaled by
 * one of the KP constants. */
T8x = FNMS(KP707106781, T7j, T7i);
132
T7k = FMA(KP707106781, T7j, T7i);
138
T8k = FNMS(KP707106781, T7M, T7L);
139
T7N = FMA(KP707106781, T7M, T7L);
146
T6N = FNMS(KP707106781, T4Z, T4S);
147
T50 = FMA(KP707106781, T4Z, T4S);
151
E T5A, T5L, TR, T1O, T5M, T5B, T3J, T24, T5F, T5O, TY, T1X, T5I, T5P, T1V;
154
E TL, TM, TO, TP, T20, T23;
155
TL = iio[-WS(ios, 31)];
156
T6A = FNMS(KP707106781, T62, T5Z);
157
T63 = FMA(KP707106781, T62, T5Z);
160
TM = rio[WS(ios, 15)];
161
TO = rio[WS(ios, 7)];
162
TP = iio[-WS(ios, 23)];
164
E T1Y, TN, TQ, T1Z, T21, T22;
165
T1Y = iio[-WS(ios, 15)];
170
T1Z = rio[WS(ios, 31)];
171
T21 = iio[-WS(ios, 7)];
172
T22 = rio[WS(ios, 23)];
181
E TV, T5D, TU, T5E, T1R, TW, T1S, T1T;
184
TS = rio[WS(ios, 3)];
187
TT = iio[-WS(ios, 19)];
188
T1P = iio[-WS(ios, 3)];
189
T1Q = rio[WS(ios, 19)];
190
TV = iio[-WS(ios, 27)];
195
TW = rio[WS(ios, 11)];
196
T1S = iio[-WS(ios, 11)];
197
T1T = rio[WS(ios, 27)];
217
E T3I, T2W, T2X, T3L, T5C, T7D, T7E, T1W, T25, T5J, T7B, T7A;
230
T26 = FNMS(KP414213562, T25, T1W);
231
T2p = FMA(KP414213562, T1W, T25);
232
T8q = FNMS(KP707106781, T7E, T7D);
233
T7F = FMA(KP707106781, T7E, T7D);
238
T37 = FNMS(KP414213562, T2W, T2X);
239
T2Y = FMA(KP414213562, T2X, T2W);
240
T6H = FNMS(KP707106781, T5J, T5C);
241
T5K = FMA(KP707106781, T5J, T5C);
242
T8r = FNMS(KP707106781, T7B, T7A);
243
T7C = FMA(KP707106781, T7B, T7A);
250
E T7n, T7q, T57, T5e;
252
E T56, T7l, T1b, Tm, T7m, T53, T3z, T1i, T58, Tp, T1o, T5c, T1n, T5b, Ts;
255
E T51, Ti, T1f, T55, T1e, T54, Tl, T1g;
257
E T1c, T1d, Tg, Th, Tj, Tk;
258
Tg = rio[WS(ios, 2)];
259
Th = iio[-WS(ios, 18)];
260
T1c = iio[-WS(ios, 2)];
261
T6G = FNMS(KP707106781, T5Q, T5N);
262
T5R = FMA(KP707106781, T5Q, T5N);
265
T1d = rio[WS(ios, 18)];
266
Tj = rio[WS(ios, 10)];
267
Tk = iio[-WS(ios, 26)];
268
T1f = iio[-WS(ios, 10)];
273
T1g = rio[WS(ios, 26)];
279
Tn = iio[-WS(ios, 30)];
286
To = rio[WS(ios, 14)];
287
T1l = iio[-WS(ios, 14)];
294
T1m = rio[WS(ios, 30)];
296
Tq = rio[WS(ios, 6)];
297
Tr = iio[-WS(ios, 22)];
298
T1o = iio[-WS(ios, 6)];
303
T1p = rio[WS(ios, 22)];
307
E T5d, T5a, T3A, T64, T65;
309
E T2k, T1j, T7o, T1k, Tt, T1q, T59, T7p, T1r;
323
E T7O, T7P, T1s, T2l;
324
T7n = FNMS(KP414213562, T7m, T7l);
325
T7O = FMA(KP414213562, T7l, T7m);
326
T7P = FMA(KP414213562, T7o, T7p);
327
T7q = FNMS(KP414213562, T7p, T7o);
340
T57 = FNMS(KP414213562, T56, T53);
341
T64 = FMA(KP414213562, T53, T56);
342
T65 = FNMS(KP414213562, T5a, T5d);
343
T5e = FMA(KP414213562, T5d, T5a);
351
E T5h, T5s, TC, T1v, T5t, T5i, T3E, T1L, T5p, T5v, TJ, T1E, T5w, T5m, T3F;
354
E Tw, Tx, Tz, TA, T1H, T1K;
355
Tw = rio[WS(ios, 1)];
360
Tx = iio[-WS(ios, 17)];
361
Tz = rio[WS(ios, 9)];
362
TA = iio[-WS(ios, 25)];
364
E T1F, Ty, TB, T1G, T1I, T1J;
365
T1F = iio[-WS(ios, 1)];
370
T1G = rio[WS(ios, 17)];
371
T1I = iio[-WS(ios, 9)];
372
T1J = rio[WS(ios, 25)];
381
E TG, T5o, TF, T5n, T1y, TH, T1z, T1A;
384
TD = rio[WS(ios, 5)];
387
TE = iio[-WS(ios, 21)];
388
T1w = iio[-WS(ios, 5)];
389
T1x = rio[WS(ios, 21)];
390
TG = iio[-WS(ios, 29)];
395
TH = rio[WS(ios, 13)];
396
T1z = iio[-WS(ios, 13)];
397
T1A = rio[WS(ios, 29)];
417
E T3D, T2U, T2T, T3G, T5j, T7w, T7x, T1M, T1D, T5q, T7u, T7t;
430
T1N = FMA(KP414213562, T1M, T1D);
431
T2o = FNMS(KP414213562, T1D, T1M);
432
T8n = FNMS(KP707106781, T7x, T7w);
433
T7y = FMA(KP707106781, T7x, T7w);
438
T36 = FMA(KP414213562, T2T, T2U);
439
T2V = FNMS(KP414213562, T2U, T2T);
440
T6E = FNMS(KP707106781, T5q, T5j);
441
T5r = FMA(KP707106781, T5q, T5j);
442
T8o = FNMS(KP707106781, T7u, T7t);
443
T7v = FMA(KP707106781, T7u, T7t);
450
E T6D, T5y, T2C, T2H, T2E, T2z;
452
E T4o, T4v, T4y, T4r, T3C, T3Y, T3V, T4i, T3N, T4h, T4l, T4j, T4k;
454
E T4H, T4K, T4N, T4J, T4O;
456
E Tv, T10, T4L, T4I, T4M;
463
T6D = FNMS(KP707106781, T5x, T5u);
464
T5y = FMA(KP707106781, T5x, T5u);
471
/* Output stores start here: results go back into the same rio/iio slots
 * that were loaded above.  The generator interleaves stores with later
 * arithmetic, so the statement order below is deliberate. */
iio[-WS(ios, 31)] = T4M + T4L;
477
E T4d, T4a, T4e, T48, T49;
482
rio[WS(ios, 16)] = FNMS(T4K, T4N, T4J);
483
iio[-WS(ios, 15)] = FMA(T4H, T4N, T4O);
486
T4i = FNMS(KP707106781, T49, T48);
487
T4a = FMA(KP707106781, T49, T48);
491
E T47, T4c, T4f, T4b, T4g;
495
T4l = FNMS(KP707106781, T4e, T4d);
496
T4f = FMA(KP707106781, T4e, T4d);
501
rio[WS(ios, 4)] = FNMS(T4c, T4f, T4b);
502
iio[-WS(ios, 27)] = FMA(T47, T4f, T4g);
507
E T4s, T4C, T4F, T4z, T4m, T4n, T4u;
508
rio[WS(ios, 20)] = FNMS(T4k, T4l, T4j);
510
iio[-WS(ios, 11)] = FMA(T4h, T4l, T4m);
518
E T4B, T4E, T4t, T4A, T4D, T4G;
525
rio[WS(ios, 24)] = FNMS(T4u, T4z, T4t);
526
iio[-WS(ios, 7)] = FMA(T4n, T4z, T4A);
527
rio[WS(ios, 8)] = FNMS(T4E, T4F, T4D);
528
iio[-WS(ios, 23)] = FMA(T4B, T4F, T4G);
532
E T45, T41, T44, T43, T46;
534
E T3x, T42, T3O, T3Z, T3Q, T3P, T40;
536
T42 = FNMS(KP707106781, T3N, T3C);
537
T3O = FMA(KP707106781, T3N, T3C);
538
T45 = FNMS(KP707106781, T3Y, T3V);
539
T3Z = FMA(KP707106781, T3Y, T3V);
545
rio[WS(ios, 28)] = FNMS(T3Q, T3Z, T3P);
548
iio[-WS(ios, 3)] = FMA(T3x, T3Z, T40);
551
E T2B, T2q, T2G, T2A, T2n, T2F, T28, T2u, T27, T1u;
554
rio[WS(ios, 12)] = FNMS(T44, T45, T43);
555
iio[-WS(ios, 19)] = FMA(T41, T45, T46);
558
T1u = FMA(KP707106781, T1t, T1a);
559
T2A = FNMS(KP707106781, T1t, T1a);
560
T2n = FMA(KP707106781, T2m, T2j);
561
T2F = FNMS(KP707106781, T2m, T2j);
562
T28 = FMA(KP923879532, T27, T1u);
563
T2u = FNMS(KP923879532, T27, T1u);
565
E T2x, T2K, T2N, T2M, T2J, T2O;
567
E T2r, T2a, T11, T2s, T29;
568
T2r = FMA(KP923879532, T2q, T2n);
569
T2x = FNMS(KP923879532, T2q, T2n);
572
T2C = FMA(KP923879532, T2B, T2A);
573
T2K = FNMS(KP923879532, T2B, T2A);
576
T2H = FMA(KP923879532, T2G, T2F);
577
T2N = FNMS(KP923879532, T2G, T2F);
579
iio[-WS(ios, 1)] = FMA(T11, T2r, T2s);
580
rio[WS(ios, 30)] = FNMS(T2a, T2r, T29);
585
E T2w, T2t, T2y, T2L, T2v;
588
iio[-WS(ios, 9)] = FMA(T2J, T2N, T2O);
591
rio[WS(ios, 22)] = FNMS(T2M, T2N, T2L);
593
iio[-WS(ios, 17)] = FMA(T2t, T2x, T2y);
596
rio[WS(ios, 14)] = FNMS(T2w, T2x, T2v);
603
E T74, T6Z, T79, T6W, T6X, T70, T6Y, T71, T76;
605
E T3p, T3m, T3h, T3q, T3l;
607
E T38, T3j, T2Z, T3o, T3i, T2S, T3n, T35, T2I, T2D;
614
iio[-WS(ios, 25)] = FMA(T2z, T2H, T2I);
615
rio[WS(ios, 6)] = FNMS(T2E, T2H, T2D);
616
T3i = FNMS(KP707106781, T2R, T2Q);
617
T2S = FMA(KP707106781, T2R, T2Q);
618
T3n = FNMS(KP707106781, T34, T33);
619
T35 = FMA(KP707106781, T34, T33);
621
E T3f, T3c, T3k, T3v, T3u, T3t;
623
E T39, T30, T3s, T32, T2P;
624
T3f = FNMS(KP923879532, T38, T35);
625
T39 = FMA(KP923879532, T38, T35);
626
T3c = FNMS(KP923879532, T2Z, T2S);
627
T30 = FMA(KP923879532, T2Z, T2S);
628
T3s = FNMS(KP923879532, T3j, T3i);
629
T3k = FMA(KP923879532, T3j, T3i);
633
E T3r, T3w, T3a, T31;
634
T3v = FNMS(KP923879532, T3o, T3n);
635
T3p = FMA(KP923879532, T3o, T3n);
641
iio[-WS(ios, 29)] = FMA(T2P, T39, T3a);
642
rio[WS(ios, 2)] = FNMS(T32, T39, T31);
644
iio[-WS(ios, 21)] = FMA(T3r, T3v, T3w);
648
E T3e, T3b, T3g, T3d;
650
rio[WS(ios, 10)] = FNMS(T3u, T3v, T3t);
656
iio[-WS(ios, 13)] = FMA(T3b, T3f, T3g);
658
rio[WS(ios, 18)] = FNMS(T3e, T3f, T3d);
664
E T77, T6C, T6P, T72, T6Q, T6R, T73, T6J, T6F, T6I;
665
T77 = FNMS(KP923879532, T6B, T6A);
666
T6C = FMA(KP923879532, T6B, T6A);
667
iio[-WS(ios, 5)] = FMA(T3h, T3p, T3q);
668
T6P = FMA(KP923879532, T6O, T6N);
669
T72 = FNMS(KP923879532, T6O, T6N);
670
rio[WS(ios, 26)] = FNMS(T3m, T3p, T3l);
671
T6Q = FMA(KP668178637, T6D, T6E);
672
T6F = FNMS(KP668178637, T6E, T6D);
673
T6I = FMA(KP668178637, T6H, T6G);
674
T6R = FNMS(KP668178637, T6G, T6H);
678
E T6z, T7f, T6K, T6M, T6L, T7c, T6T, T7b, T7e, T78, T6S;
680
T74 = FMA(KP831469612, T73, T72);
681
T7f = FNMS(KP831469612, T73, T72);
682
T6Z = FNMS(KP831469612, T6J, T6C);
683
T6K = FMA(KP831469612, T6J, T6C);
688
T79 = FMA(KP831469612, T78, T77);
689
T7c = FNMS(KP831469612, T78, T77);
690
T6W = FNMS(KP831469612, T6S, T6P);
691
T6T = FMA(KP831469612, T6S, T6P);
695
E T6V, T6U, T7g, T7d;
697
iio[-WS(ios, 2)] = FMA(T6M, T6T, T6L);
702
rio[WS(ios, 29)] = FNMS(T6M, T6K, T6U);
703
rio[WS(ios, 21)] = FNMS(T7e, T7c, T7g);
704
iio[-WS(ios, 10)] = FMA(T7e, T7f, T7d);
714
E T6m, T6h, T6r, T6e, T6f, T6i, T6g, T6j, T6o;
716
E T6p, T5g, T6k, T67, T68, T69, T6l, T5T;
718
E T5z, T5S, T7a, T75;
719
T6p = FNMS(KP923879532, T5f, T50);
720
T5g = FMA(KP923879532, T5f, T50);
721
iio[-WS(ios, 18)] = FMA(T6Y, T6W, T70);
722
rio[WS(ios, 13)] = FNMS(T6Y, T6Z, T6X);
725
T6k = FNMS(KP923879532, T66, T63);
726
T67 = FMA(KP923879532, T66, T63);
727
T68 = FMA(KP198912367, T5r, T5y);
728
T5z = FNMS(KP198912367, T5y, T5r);
729
iio[-WS(ios, 26)] = FMA(T76, T74, T7a);
730
rio[WS(ios, 5)] = FNMS(T76, T79, T75);
731
T5S = FMA(KP198912367, T5R, T5K);
732
T69 = FNMS(KP198912367, T5K, T5R);
737
E T4P, T6x, T5U, T5W, T5V, T6u, T6b, T6t, T6w, T6q, T6a;
739
T6m = FMA(KP980785280, T6l, T6k);
740
T6x = FNMS(KP980785280, T6l, T6k);
741
T6h = FNMS(KP980785280, T5T, T5g);
742
T5U = FMA(KP980785280, T5T, T5g);
747
T6r = FMA(KP980785280, T6q, T6p);
748
T6u = FNMS(KP980785280, T6q, T6p);
749
T6e = FNMS(KP980785280, T6a, T67);
750
T6b = FMA(KP980785280, T6a, T67);
754
E T6d, T6c, T6y, T6v;
756
rio[WS(ios, 1)] = FNMS(T5W, T6b, T5V);
761
iio[-WS(ios, 30)] = FMA(T5W, T5U, T6c);
762
iio[-WS(ios, 22)] = FMA(T6w, T6u, T6y);
763
rio[WS(ios, 9)] = FNMS(T6w, T6x, T6v);
772
E T8O, T8J, T8T, T8G, T8H, T8K, T8I, T8L, T8Q;
774
E T8R, T8m, T8M, T8z, T8A, T8B, T8N, T8t;
776
E T8p, T8s, T6s, T6n;
777
T8R = FMA(KP923879532, T8l, T8k);
778
T8m = FNMS(KP923879532, T8l, T8k);
779
rio[WS(ios, 17)] = FNMS(T6g, T6e, T6i);
780
iio[-WS(ios, 14)] = FMA(T6g, T6h, T6f);
783
T8M = FNMS(KP923879532, T8y, T8x);
784
T8z = FMA(KP923879532, T8y, T8x);
785
T8A = FMA(KP668178637, T8n, T8o);
786
T8p = FNMS(KP668178637, T8o, T8n);
787
rio[WS(ios, 25)] = FNMS(T6o, T6m, T6s);
788
iio[-WS(ios, 6)] = FMA(T6o, T6r, T6n);
789
T8s = FNMS(KP668178637, T8r, T8q);
790
T8B = FMA(KP668178637, T8q, T8r);
795
E T8j, T8Z, T8u, T8w, T8v, T8W, T8D, T8V, T8Y, T8S, T8C;
797
T8O = FMA(KP831469612, T8N, T8M);
798
T8Z = FNMS(KP831469612, T8N, T8M);
799
T8J = FNMS(KP831469612, T8t, T8m);
800
T8u = FMA(KP831469612, T8t, T8m);
805
T8T = FMA(KP831469612, T8S, T8R);
806
T8W = FNMS(KP831469612, T8S, T8R);
807
T8G = FNMS(KP831469612, T8C, T8z);
808
T8D = FMA(KP831469612, T8C, T8z);
812
E T8F, T8E, T90, T8X;
814
rio[WS(ios, 3)] = FNMS(T8w, T8D, T8v);
819
iio[-WS(ios, 28)] = FMA(T8w, T8u, T8E);
820
iio[-WS(ios, 20)] = FMA(T8Y, T8W, T90);
821
rio[WS(ios, 11)] = FNMS(T8Y, T8Z, T8X);
830
E T89, T7s, T84, T7R, T7S, T7T, T85, T7H;
832
E T7z, T7G, T8U, T8P;
833
T89 = FNMS(KP923879532, T7r, T7k);
834
T7s = FMA(KP923879532, T7r, T7k);
835
rio[WS(ios, 19)] = FNMS(T8I, T8G, T8K);
836
iio[-WS(ios, 12)] = FMA(T8I, T8J, T8H);
839
T84 = FNMS(KP923879532, T7Q, T7N);
840
T7R = FMA(KP923879532, T7Q, T7N);
841
T7S = FMA(KP198912367, T7v, T7y);
842
T7z = FNMS(KP198912367, T7y, T7v);
843
rio[WS(ios, 27)] = FNMS(T8Q, T8O, T8U);
844
iio[-WS(ios, 4)] = FMA(T8Q, T8T, T8P);
845
T7G = FNMS(KP198912367, T7F, T7C);
846
T7T = FMA(KP198912367, T7C, T7F);
851
E T7h, T8h, T7I, T7K, T7J, T8e, T7V, T8d, T8g, T8a, T7U;
853
T86 = FNMS(KP980785280, T85, T84);
854
T8h = FMA(KP980785280, T85, T84);
855
T81 = FNMS(KP980785280, T7H, T7s);
856
T7I = FMA(KP980785280, T7H, T7s);
861
T8b = FMA(KP980785280, T8a, T89);
862
T8e = FNMS(KP980785280, T8a, T89);
863
T7Y = FNMS(KP980785280, T7U, T7R);
864
T7V = FMA(KP980785280, T7U, T7R);
868
E T7X, T7W, T8i, T8f;
870
iio[0] = FMA(T7K, T7V, T7J);
875
rio[WS(ios, 31)] = FNMS(T7K, T7I, T7W);
876
rio[WS(ios, 23)] = FNMS(T8g, T8e, T8i);
877
iio[-WS(ios, 8)] = FMA(T8g, T8h, T8f);
890
iio[-WS(ios, 16)] = FMA(T80, T7Y, T82);
891
rio[WS(ios, 15)] = FNMS(T80, T81, T7Z);
894
iio[-WS(ios, 24)] = FMA(T88, T86, T8c);
895
rio[WS(ios, 7)] = FNMS(T88, T8b, T87);
900
/* Twiddle instruction table referenced by desc; its entries are not
 * visible in this listing. */
static const tw_instr twinstr[] = {
905
/*
 * Descriptor passed to X(khc2hc_register) together with hb_32.  Fields as
 * written: 32 (presumably the transform size, matching "-n 32" in the
 * generator banner), the codelet name, the twiddle table, &GENUS, and
 * {236, 62, 198, 0}, whose first three values equal the additions /
 * multiplications / fused multiply-add counts quoted in the banner for
 * this FMA variant.  The meaning of the remaining zero fields is set by
 * hc2hc_desc, which is declared outside this listing.
 */
static const hc2hc_desc desc = { 32, "hb_32", twinstr, &GENUS, {236, 62, 198, 0}, 0, 0, 0 };
907
/* Registers the FMA-variant hb_32 codelet and its descriptor with planner p
 * by calling X(khc2hc_register). */
void X(codelet_hb_32) (planner *p) {
908
X(khc2hc_register) (p, hb_32, &desc);
912
/* Generated by: ../../../genfft/gen_hc2hc -compact -variables 4 -pipeline-latency 4 -sign 1 -n 32 -dif -name hb_32 -include hb.h */
915
* This function contains 434 FP additions, 208 FP multiplications,
916
* (or, 340 additions, 114 multiplications, 94 fused multiply/add),
917
* 98 stack variables, and 128 memory accesses
921
* $Id: algsimp.ml,v 1.9 2006-02-12 23:34:12 athena Exp $
922
* $Id: fft.ml,v 1.4 2006-01-05 03:04:27 stevenj Exp $
923
* $Id: gen_hc2hc.ml,v 1.16 2006-02-12 23:34:12 athena Exp $
928
/*
 * hb_32 (non-FMA variant) -- machine-generated hc2hc codelet.  Per the
 * "Generated by" banner it was produced by gen_hc2hc with
 * "-compact ... -sign 1 -n 32 -dif -name hb_32" (no -fma option), so most
 * scalings appear as explicit products such as KP707106781 * (a - b).
 *
 * What the visible code shows:
 *   rio, iio  data arrays that are read and then overwritten in place;
 *             rio is indexed at +WS(ios, k), iio at -WS(ios, k).
 *   W         advanced by 62 elements per loop iteration (presumably the
 *             twiddle-factor table described by twinstr -- confirm).
 *   ios       stride argument handed to WS().
 *   m         loop bound: the loop runs for i = m - 2, m - 4, ... while i > 0.
 *   dist      per-iteration step: rio += dist, iio -= dist.
 *
 * FMA(), FNMS(), DK(), WS(), E and R are defined outside this listing;
 * presumably FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b -- confirm.
 *
 * NOTE(review): this copy is missing many generated lines (braces, the
 * declaration of i, most temporary assignments, every use of W and the
 * return statement) and carries stray line-number residue.  Do not
 * hand-edit; regenerate with genfft instead.
 */
static const R *hb_32(R *rio, R *iio, const R *W, stride ios, INT m, INT dist)
930
/* Numeric constants; each name encodes the leading digits of its value.
 * KP555570233 ~ sin(3*pi/16), KP831469612 ~ cos(3*pi/16),
 * KP195090322 ~ sin(pi/16), KP980785280 ~ cos(pi/16),
 * KP382683432 ~ sin(pi/8), KP923879532 ~ cos(pi/8),
 * KP707106781 ~ sqrt(1/2). */
DK(KP555570233, +0.555570233019602224742830813948532874374937191);
931
DK(KP831469612, +0.831469612302545237078788377617905756738560812);
932
DK(KP195090322, +0.195090322016128267848284868477022240927691618);
933
DK(KP980785280, +0.980785280403230449126182236134239036973933731);
934
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
935
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
936
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
938
/* One pass per iteration: i counts down by 2 from m - 2 while rio walks
 * forward by dist, iio walks backward by dist and W advances by 62. */
for (i = m - 2; i > 0; i = i - 2, rio = rio + dist, iio = iio - dist, W = W + 62, MAKE_VOLATILE_STRIDE(ios)) {
939
/* E-typed temporaries; the generator declares them scope by scope. */
E T5n, T6y, T77, T5u, Tf, T3i, T5x, T76, T3G, T47, T1a, T2I, T5k, T6z, T2o;
940
E T2Y, Tu, T3D, T6D, T73, T6G, T74, T1j, T2d, T1s, T2e, T55, T5z, T5c, T5A;
941
E T3l, T48, TK, T3n, T6L, T7t, T6O, T7s, T1D, T2L, T1M, T2M, T4w, T62, T4D;
942
E T61, T3q, T41, TZ, T3s, T6S, T7w, T6V, T7v, T1W, T2O, T25, T2P, T4P, T64;
943
E T4W, T65, T3v, T42;
945
E T3, T5l, T2j, T5t, T6, T5s, T2m, T5m, Ta, T5i, T15, T5h, Td, T5e, T18;
950
/* Input loads: rio[+WS(ios, k)] and iio[-WS(ios, k)]. */
T2 = iio[-WS(ios, 16)];
954
T2i = rio[WS(ios, 16)];
960
T4 = rio[WS(ios, 8)];
961
T5 = iio[-WS(ios, 24)];
964
T2k = iio[-WS(ios, 8)];
965
T2l = rio[WS(ios, 24)];
971
T8 = rio[WS(ios, 4)];
972
T9 = iio[-WS(ios, 20)];
975
T13 = iio[-WS(ios, 4)];
976
T14 = rio[WS(ios, 20)];
982
Tb = iio[-WS(ios, 28)];
983
Tc = rio[WS(ios, 12)];
986
T16 = iio[-WS(ios, 12)];
987
T17 = rio[WS(ios, 28)];
1002
E T5v, T5w, T3E, T3F;
1005
/* Recurring pattern: difference and sum of the same two temporaries, each
 * scaled by KP707106781. */
T5x = KP707106781 * (T5v - T5w);
1006
T76 = KP707106781 * (T5v + T5w);
1017
E T5g, T5j, T2g, T2n;
1020
T5k = KP707106781 * (T5g - T5j);
1021
T6z = KP707106781 * (T5j + T5g);
1030
E Ti, T4Z, T1e, T53, Tl, T52, T1h, T50, Tp, T56, T1n, T5a, Ts, T59, T1q;
1034
Tg = rio[WS(ios, 2)];
1035
Th = iio[-WS(ios, 18)];
1038
T1c = iio[-WS(ios, 2)];
1039
T1d = rio[WS(ios, 18)];
1045
Tj = rio[WS(ios, 10)];
1046
Tk = iio[-WS(ios, 26)];
1049
T1f = iio[-WS(ios, 10)];
1050
T1g = rio[WS(ios, 26)];
1056
Tn = iio[-WS(ios, 30)];
1057
To = rio[WS(ios, 14)];
1060
T1l = iio[-WS(ios, 14)];
1061
T1m = rio[WS(ios, 30)];
1067
Tq = rio[WS(ios, 6)];
1068
Tr = iio[-WS(ios, 22)];
1071
T1o = iio[-WS(ios, 6)];
1072
T1p = rio[WS(ios, 22)];
1084
/* Recurring pattern: two temporaries combined with the constant pair
 * KP923879532 / KP382683432 (or KP980785280 / KP195090322, or
 * KP831469612 / KP555570233 further down). */
T6D = FNMS(KP382683432, T6C, KP923879532 * T6B);
1085
T73 = FMA(KP382683432, T6B, KP923879532 * T6C);
1088
E T6E, T6F, T1b, T1i;
1091
T6G = FNMS(KP923879532, T6F, KP382683432 * T6E);
1092
T74 = FMA(KP923879532, T6E, KP382683432 * T6F);
1099
E T1k, T1r, T51, T54;
1106
T55 = FNMS(KP382683432, T54, KP923879532 * T51);
1107
T5z = FMA(KP923879532, T54, KP382683432 * T51);
1110
E T58, T5b, T3j, T3k;
1113
T5c = FMA(KP923879532, T58, KP382683432 * T5b);
1114
T5A = FNMS(KP382683432, T58, KP923879532 * T5b);
1122
E Ty, T4t, T1H, T4y, TB, T4x, T1K, T4u, TI, T4B, T1B, T4o, TF, T4A, T1y;
1126
Tw = rio[WS(ios, 1)];
1127
Tx = iio[-WS(ios, 17)];
1132
T1F = iio[-WS(ios, 1)];
1133
T1G = rio[WS(ios, 17)];
1136
Tz = rio[WS(ios, 9)];
1137
TA = iio[-WS(ios, 25)];
1141
T1I = iio[-WS(ios, 9)];
1142
T1J = rio[WS(ios, 25)];
1146
E TG, TH, T4m, T1z, T1A, T4n;
1147
TG = iio[-WS(ios, 29)];
1148
TH = rio[WS(ios, 13)];
1150
T1z = iio[-WS(ios, 13)];
1151
T1A = rio[WS(ios, 29)];
1159
E TD, TE, T4q, T1w, T1x, T4p;
1160
TD = rio[WS(ios, 5)];
1161
TE = iio[-WS(ios, 21)];
1163
T1w = iio[-WS(ios, 5)];
1164
T1x = rio[WS(ios, 21)];
1179
T6K = KP707106781 * (T4r + T4o);
1184
E T6M, T6N, T1v, T1C;
1185
T6M = KP707106781 * (T4A + T4B);
1195
E T1E, T1L, T4s, T4v;
1200
T4s = KP707106781 * (T4o - T4r);
1206
E T4z, T4C, T3o, T3p;
1208
T4C = KP707106781 * (T4A - T4B);
1218
E TN, T4T, T20, T4N, TQ, T4M, T23, T4U, TX, T4Q, T1U, T4K, TU, T4R, T1R;
1222
TL = iio[-WS(ios, 31)];
1223
TM = rio[WS(ios, 15)];
1228
T1Y = iio[-WS(ios, 15)];
1229
T1Z = rio[WS(ios, 31)];
1232
TO = rio[WS(ios, 7)];
1233
TP = iio[-WS(ios, 23)];
1237
T21 = iio[-WS(ios, 7)];
1238
T22 = rio[WS(ios, 23)];
1242
E TV, TW, T4I, T1S, T1T, T4J;
1243
TV = iio[-WS(ios, 27)];
1244
TW = rio[WS(ios, 11)];
1246
T1S = iio[-WS(ios, 11)];
1247
T1T = rio[WS(ios, 27)];
1255
E TS, TT, T4F, T1P, T1Q, T4G;
1256
TS = rio[WS(ios, 3)];
1257
TT = iio[-WS(ios, 19)];
1259
T1P = iio[-WS(ios, 3)];
1260
T1Q = rio[WS(ios, 19)];
1274
T6Q = KP707106781 * (T4R + T4Q);
1280
E T6T, T6U, T1O, T1V;
1281
T6T = KP707106781 * (T4H + T4K);
1291
E T1X, T24, T4L, T4O;
1296
T4L = KP707106781 * (T4H - T4K);
1302
E T4S, T4V, T3t, T3u;
1303
T4S = KP707106781 * (T4Q - T4R);
1314
E Tv, T10, T4g, T4i, T4j, T4k, T4f, T4h;
1322
/* Output stores start here: results go back into the same rio/iio slots
 * that were loaded above.  Each rio/iio pair below combines the same four
 * temporaries, one with FNMS and one with FMA; presumably two of them are
 * factors loaded from W on lines not visible here -- confirm. */
iio[-WS(ios, 31)] = T4j + T4i;
1325
rio[WS(ios, 16)] = FNMS(T4h, T4k, T4f * T4g);
1326
iio[-WS(ios, 15)] = FMA(T4h, T4g, T4f * T4k);
1329
E T44, T4c, T4a, T4e;
1331
E T40, T43, T46, T49;
1342
E T3Z, T45, T4b, T4d;
1345
rio[WS(ios, 24)] = FNMS(T45, T4a, T3Z * T44);
1346
iio[-WS(ios, 7)] = FMA(T45, T44, T3Z * T4a);
1349
rio[WS(ios, 8)] = FNMS(T4d, T4e, T4b * T4c);
1350
iio[-WS(ios, 23)] = FMA(T4d, T4c, T4b * T4e);
1354
E T3m, T3H, T3T, T3O, T3C, T3P, T3x, T3S;
1360
E T3A, T3B, T3r, T3w;
1363
T3C = KP707106781 * (T3A + T3B);
1364
T3P = KP707106781 * (T3B - T3A);
1367
T3x = KP707106781 * (T3r + T3w);
1368
T3S = KP707106781 * (T3r - T3w);
1371
E T3y, T3I, T3h, T3z;
1376
rio[WS(ios, 28)] = FNMS(T3z, T3I, T3h * T3y);
1377
iio[-WS(ios, 3)] = FMA(T3z, T3y, T3h * T3I);
1380
E T3W, T3Y, T3V, T3X;
1385
rio[WS(ios, 20)] = FNMS(T3X, T3Y, T3V * T3W);
1386
iio[-WS(ios, 11)] = FMA(T3X, T3W, T3V * T3Y);
1389
E T3K, T3M, T3J, T3L;
1394
rio[WS(ios, 12)] = FNMS(T3L, T3M, T3J * T3K);
1395
iio[-WS(ios, 19)] = FMA(T3L, T3K, T3J * T3M);
1398
E T3Q, T3U, T3N, T3R;
1403
rio[WS(ios, 4)] = FNMS(T3R, T3U, T3N * T3Q);
1404
iio[-WS(ios, 27)] = FMA(T3R, T3Q, T3N * T3U);
1408
E T2K, T36, T2Z, T3b, T2R, T3a, T2W, T37, T2J, T2X;
1409
T2J = KP707106781 * (T2e - T2d);
1412
T2X = KP707106781 * (T1j - T1s);
1416
E T2N, T2Q, T2U, T2V;
1417
T2N = FNMS(KP382683432, T2M, KP923879532 * T2L);
1418
T2Q = FMA(KP923879532, T2O, KP382683432 * T2P);
1421
T2U = FMA(KP382683432, T2L, KP923879532 * T2M);
1422
T2V = FNMS(KP382683432, T2O, KP923879532 * T2P);
1427
E T2S, T30, T2H, T2T;
1432
rio[WS(ios, 2)] = FNMS(T2T, T30, T2H * T2S);
1433
iio[-WS(ios, 29)] = FMA(T2T, T2S, T2H * T30);
1436
E T3e, T3g, T3d, T3f;
1441
rio[WS(ios, 10)] = FNMS(T3f, T3g, T3d * T3e);
1442
iio[-WS(ios, 21)] = FMA(T3f, T3e, T3d * T3g);
1445
E T32, T34, T31, T33;
1450
rio[WS(ios, 18)] = FNMS(T33, T34, T31 * T32);
1451
iio[-WS(ios, 13)] = FMA(T33, T32, T31 * T34);
1454
E T38, T3c, T35, T39;
1459
rio[WS(ios, 26)] = FNMS(T39, T3c, T35 * T38);
1460
iio[-WS(ios, 5)] = FMA(T39, T38, T35 * T3c);
1464
E T1u, T2w, T2p, T2B, T27, T2A, T2c, T2x, T1t, T2f;
1465
T1t = KP707106781 * (T1j + T1s);
1468
T2f = KP707106781 * (T2d + T2e);
1472
E T1N, T26, T2a, T2b;
1473
T1N = FMA(KP923879532, T1D, KP382683432 * T1M);
1474
T26 = FNMS(KP382683432, T25, KP923879532 * T1W);
1477
T2a = FNMS(KP382683432, T1D, KP923879532 * T1M);
1478
T2b = FMA(KP382683432, T1W, KP923879532 * T25);
1483
E T28, T2q, T11, T29;
1488
rio[WS(ios, 30)] = FNMS(T29, T2q, T11 * T28);
1489
iio[-WS(ios, 1)] = FMA(T29, T28, T11 * T2q);
1492
E T2E, T2G, T2D, T2F;
1497
rio[WS(ios, 22)] = FNMS(T2F, T2G, T2D * T2E);
1498
iio[-WS(ios, 9)] = FMA(T2F, T2E, T2D * T2G);
1501
E T2s, T2u, T2r, T2t;
1506
rio[WS(ios, 14)] = FNMS(T2t, T2u, T2r * T2s);
1507
iio[-WS(ios, 17)] = FMA(T2t, T2s, T2r * T2u);
1510
E T2y, T2C, T2v, T2z;
1515
rio[WS(ios, 6)] = FNMS(T2z, T2C, T2v * T2y);
1516
iio[-WS(ios, 25)] = FMA(T2z, T2y, T2v * T2C);
1520
E T4Y, T5N, T5F, T5Q, T5p, T5R, T5C, T5M;
1522
E T4E, T4X, T5D, T5E;
1523
T4E = FNMS(KP195090322, T4D, KP980785280 * T4w);
1524
T4X = FMA(KP195090322, T4P, KP980785280 * T4W);
1527
T5D = FMA(KP980785280, T4D, KP195090322 * T4w);
1528
T5E = FNMS(KP195090322, T4W, KP980785280 * T4P);
1533
E T5d, T5o, T5y, T5B;
1544
E T5q, T5G, T4l, T5r;
1549
rio[WS(ios, 1)] = FNMS(T5r, T5G, T4l * T5q);
1550
iio[-WS(ios, 30)] = FMA(T4l, T5G, T5r * T5q);
1553
E T5U, T5W, T5T, T5V;
1558
rio[WS(ios, 9)] = FNMS(T5V, T5W, T5T * T5U);
1559
iio[-WS(ios, 22)] = FMA(T5T, T5W, T5V * T5U);
1562
E T5I, T5K, T5H, T5J;
1567
iio[-WS(ios, 14)] = FMA(T5H, T5I, T5J * T5K);
1568
rio[WS(ios, 17)] = FNMS(T5J, T5I, T5H * T5K);
1571
E T5O, T5S, T5L, T5P;
1576
iio[-WS(ios, 6)] = FMA(T5L, T5O, T5P * T5S);
1577
rio[WS(ios, 25)] = FNMS(T5P, T5O, T5L * T5S);
1581
E T60, T6q, T6f, T6n, T67, T6m, T6c, T6r;
1583
E T5Y, T5Z, T6d, T6e;
1594
E T63, T66, T6a, T6b;
1595
T63 = FNMS(KP555570233, T62, KP831469612 * T61);
1596
T66 = FMA(KP831469612, T64, KP555570233 * T65);
1599
T6a = FMA(KP555570233, T61, KP831469612 * T62);
1600
T6b = FNMS(KP555570233, T64, KP831469612 * T65);
1605
E T68, T6g, T5X, T69;
1610
iio[-WS(ios, 2)] = FMA(T5X, T68, T69 * T6g);
1611
rio[WS(ios, 29)] = FNMS(T69, T68, T5X * T6g);
1614
E T6u, T6w, T6t, T6v;
1619
iio[-WS(ios, 10)] = FMA(T6t, T6u, T6v * T6w);
1620
rio[WS(ios, 21)] = FNMS(T6v, T6u, T6t * T6w);
1623
E T6i, T6k, T6h, T6j;
1628
rio[WS(ios, 13)] = FNMS(T6j, T6k, T6h * T6i);
1629
iio[-WS(ios, 18)] = FMA(T6h, T6k, T6j * T6i);
1632
E T6o, T6s, T6l, T6p;
1637
rio[WS(ios, 5)] = FNMS(T6p, T6s, T6l * T6o);
1638
iio[-WS(ios, 26)] = FMA(T6l, T6s, T6p * T6o);
1642
E T7y, T7R, T7J, T7U, T7B, T7V, T7G, T7Q;
1644
E T7u, T7x, T7H, T7I;
1645
T7u = FNMS(KP555570233, T7t, KP831469612 * T7s);
1646
T7x = FNMS(KP555570233, T7w, KP831469612 * T7v);
1649
T7H = FMA(KP831469612, T7t, KP555570233 * T7s);
1650
T7I = FMA(KP831469612, T7w, KP555570233 * T7v);
1655
E T7z, T7A, T7E, T7F;
1666
E T7C, T7K, T7r, T7D;
1671
rio[WS(ios, 3)] = FNMS(T7D, T7K, T7r * T7C);
1672
iio[-WS(ios, 28)] = FMA(T7r, T7K, T7D * T7C);
1675
E T7Y, T80, T7X, T7Z;
1680
rio[WS(ios, 11)] = FNMS(T7Z, T80, T7X * T7Y);
1681
iio[-WS(ios, 20)] = FMA(T7X, T80, T7Z * T7Y);
1684
E T7M, T7O, T7L, T7N;
1689
iio[-WS(ios, 12)] = FMA(T7L, T7M, T7N * T7O);
1690
rio[WS(ios, 19)] = FNMS(T7N, T7M, T7L * T7O);
1693
E T7S, T7W, T7P, T7T;
1698
iio[-WS(ios, 4)] = FMA(T7P, T7S, T7T * T7W);
1699
rio[WS(ios, 27)] = FNMS(T7T, T7S, T7P * T7W);
1703
E T6I, T7k, T79, T7h, T6X, T7g, T72, T7l;
1705
E T6A, T6H, T75, T78;
1716
E T6P, T6W, T70, T71;
1717
T6P = FNMS(KP195090322, T6O, KP980785280 * T6L);
1718
T6W = FMA(KP980785280, T6S, KP195090322 * T6V);
1721
T70 = FMA(KP195090322, T6L, KP980785280 * T6O);
1722
T71 = FNMS(KP195090322, T6S, KP980785280 * T6V);
1727
E T6Y, T7a, T6x, T6Z;
1732
iio[0] = FMA(T6x, T6Y, T6Z * T7a);
1733
rio[WS(ios, 31)] = FNMS(T6Z, T6Y, T6x * T7a);
1736
E T7o, T7q, T7n, T7p;
1741
iio[-WS(ios, 8)] = FMA(T7n, T7o, T7p * T7q);
1742
rio[WS(ios, 23)] = FNMS(T7p, T7o, T7n * T7q);
1745
E T7c, T7e, T7b, T7d;
1750
rio[WS(ios, 15)] = FNMS(T7d, T7e, T7b * T7c);
1751
iio[-WS(ios, 16)] = FMA(T7b, T7e, T7d * T7c);
1754
E T7i, T7m, T7f, T7j;
1759
rio[WS(ios, 7)] = FNMS(T7j, T7m, T7f * T7i);
1760
iio[-WS(ios, 24)] = FMA(T7f, T7m, T7j * T7i);
1767
/* Twiddle instruction table referenced by desc; its entries are not
 * visible in this listing. */
static const tw_instr twinstr[] = {
1772
/*
 * Descriptor passed to X(khc2hc_register) together with hb_32.  Fields as
 * written: 32 (presumably the transform size, matching "-n 32" in the
 * generator banner), the codelet name, the twiddle table, &GENUS, and
 * {340, 114, 94, 0}, whose first three values equal the additions /
 * multiplications / fused multiply-add counts quoted in the banner for
 * this non-FMA variant.  The meaning of the remaining zero fields is set
 * by hc2hc_desc, which is declared outside this listing.
 */
static const hc2hc_desc desc = { 32, "hb_32", twinstr, &GENUS, {340, 114, 94, 0}, 0, 0, 0 };
1774
/* Registers the non-FMA-variant hb_32 codelet and its descriptor with
 * planner p by calling X(khc2hc_register). */
void X(codelet_hb_32) (planner *p) {
1775
X(khc2hc_register) (p, hb_32, &desc);
1777
#endif /* HAVE_FMA */