~ubuntu-branches/ubuntu/utopic/fftw3/utopic

« back to all changes in this revision

Viewing changes to rdft/simd/codelets/hc2cfdftv_32.c

  • Committer: Package Import Robot
  • Author(s): Matthias Klose
  • Date: 2011-12-14 13:21:22 UTC
  • mfrom: (3.1.5 sid)
  • Revision ID: package-import@ubuntu.com-20111214132122-l4avyl2kkr7vq5aj
Tags: 3.3-1ubuntu1
* Merge with Debian; remaining changes:
  - Revert the ARM workaround.

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
 
/*
2
 
 * Copyright (c) 2003, 2007-8 Matteo Frigo
3
 
 * Copyright (c) 2003, 2007-8 Massachusetts Institute of Technology
4
 
 *
5
 
 * This program is free software; you can redistribute it and/or modify
6
 
 * it under the terms of the GNU General Public License as published by
7
 
 * the Free Software Foundation; either version 2 of the License, or
8
 
 * (at your option) any later version.
9
 
 *
10
 
 * This program is distributed in the hope that it will be useful,
11
 
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
 
 * GNU General Public License for more details.
14
 
 *
15
 
 * You should have received a copy of the GNU General Public License
16
 
 * along with this program; if not, write to the Free Software
17
 
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18
 
 *
19
 
 */
20
 
 
21
 
/* This file was automatically generated --- DO NOT EDIT */
22
 
/* Generated on Sun Jul 12 06:47:45 EDT 2009 */
23
 
 
24
 
#include "codelet-rdft.h"
25
 
 
26
 
#ifdef HAVE_FMA
27
 
 
28
 
/* Generated by: ../../../genfft/gen_hc2cdft_c -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 32 -dit -name hc2cfdftv_32 -include hc2cfv.h */
29
 
 
30
 
/*
31
 
 * This function contains 249 FP additions, 224 FP multiplications,
32
 
 * (or, 119 additions, 94 multiplications, 130 fused multiply/add),
33
 
 * 167 stack variables, 8 constants, and 64 memory accesses
34
 
 */
35
 
#include "hc2cfv.h"
36
 
 
37
 
/*
 * hc2cfdftv_32, HAVE_FMA variant: SIMD codelet body emitted by genfft (the
 * generator flags are listed in the "Generated by" comment above).
 *
 * Loop: m runs from mb while m < me in steps of VL.  W is first offset by
 * (mb - 1) * ((TWVL / VL) * 62) and then advanced by TWVL * 62 per pass;
 * Rp and Ip move forward by VL * ms, Rm and Im move backward by VL * ms.
 * Ip and Im are only advanced; this body never dereferences them.
 *
 * Each pass:
 *   - loads 16 vectors from Rp[WS(rs, k)] and 16 from Rm[WS(rs, k)],
 *     k = 0..15 (LD), plus 31 twiddle vectors W[TWVL * 2j], j = 0..30 (LDW);
 *   - forms VFMACONJ(Rm, Rp) and VFNMSCONJ(Rm, Rp) for every pair and
 *     multiplies each of them, except the k = 0 VFMACONJ term (T3), by a
 *     twiddle via VZMULJ / VZMULIJ;
 *   - runs the VADD / VSUB / VFMA / VFNMS network with the DVK constants;
 *   - multiplies every result by KP500000000 and stores 16 vectors to Rp
 *     and 16 to Rm (ST); values written to Rm go through VCONJ first.
 *
 * The DVK constants numerically match: KP707106781 = cos(pi/4),
 * KP923879532 = cos(pi/8), KP414213562 = tan(pi/8),
 * KP980785280 = cos(pi/16), KP198912367 = tan(pi/16),
 * KP831469612 = cos(3*pi/16), KP668178637 = tan(3*pi/16).
 *
 * NOTE(review): the exact semantics of LD, LDW, ST, WS, VZMULJ, VZMULIJ,
 * VFMACONJ, VFNMSCONJ, VFMAI, VFNMSI and MAKE_VOLATILE_STRIDE come from
 * hc2cfv.h / the SIMD headers, which are not visible here -- confirm there.
 */
static void hc2cfdftv_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
38
 
{
39
 
     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
40
 
     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
41
 
     DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
42
 
     DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
43
 
     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
44
 
     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
45
 
     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
46
 
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
47
 
     INT m;
48
 
     /* One pass per VL values of m; all pointer updates live in the loop header. */
     for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 62)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(rs)) {
49
 
          V T2m, T2b, T2c, T2d, T2v, T2r, T20, T2i, T2n, T2e, T2o, T2u, T2j, T2f, T2t;
50
 
          V T2s, T2x, T2w, T2l, T2k, T2h, T2g;
51
 
          {
52
 
               V T41, T3B, T40, T3a, T2J, T27, T2y, Ts, T2C, T1X, T2B, T1Q, T3F, T3w, T4l;
53
 
               V T49, T1b, T1s, T3c, TB, T1f, T3g, T44, T1l, T3k, T3o, T4b, T28, T14, T1d;
54
 
               V T3b, TK;
55
 
               {
56
 
                    V T1V, T1E, T3A, Th, T3v, T47, T1J, T3q, T8, T38, T25, T39, T3z, Tq, T1O;
57
 
                    V T3r, T3, T7, T3u, T24, T22, T3t, T1I, Tn, T1G, To, Tm, T1K, Tl, T1N;
58
 
                    V Tp, T1L, TU, T3f, T3m, T13, T3e, T3n, T1i, TH, TI, T1k, TG, TF, T1c;
59
 
                    V TJ;
60
 
                    {
61
 
                         V T1x, T1y, T1U, T1B, T1S, T1C, T1A, T23, T21, T1z, T1, T2, T1T, T5, T6;
62
 
                         V T1R, T4, T1w, Ta, Tb, T1H, Te, Tf, Td, Tc, T1F, T9, T1D, Tj, Tk;
63
 
                         V Ti, Tg, T1M;
64
 
                         /* Even-indexed inputs (k = 0, 2, ..., 14): Rp/Rm pairs and their twiddles. */
                         T1 = LD(&(Rp[0]), ms, &(Rp[0]));
65
 
                         T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
66
 
                         T1T = LDW(&(W[0]));
67
 
                         T5 = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
68
 
                         T6 = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
69
 
                         T1R = LDW(&(W[TWVL * 32]));
70
 
                         T4 = LDW(&(W[TWVL * 30]));
71
 
                         T1x = LD(&(Rp[WS(rs, 12)]), ms, &(Rp[0]));
72
 
                         T1y = LD(&(Rm[WS(rs, 12)]), -ms, &(Rm[0]));
73
 
                         T3 = VFMACONJ(T2, T1);
74
 
                         T1U = VZMULIJ(T1T, VFNMSCONJ(T2, T1));
75
 
                         T1w = LDW(&(W[TWVL * 48]));
76
 
                         T1B = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
77
 
                         T1S = VZMULIJ(T1R, VFNMSCONJ(T6, T5));
78
 
                         T7 = VZMULJ(T4, VFMACONJ(T6, T5));
79
 
                         T1C = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
80
 
                         T1A = LDW(&(W[TWVL * 16]));
81
 
                         T23 = LDW(&(W[TWVL * 46]));
82
 
                         T21 = LDW(&(W[TWVL * 14]));
83
 
                         T1z = VZMULIJ(T1w, VFNMSCONJ(T1y, T1x));
84
 
                         Ta = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
85
 
                         T3u = VADD(T1U, T1S);
86
 
                         T1V = VSUB(T1S, T1U);
87
 
                         Tb = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
88
 
                         T9 = LDW(&(W[TWVL * 6]));
89
 
                         T1D = VZMULIJ(T1A, VFNMSCONJ(T1C, T1B));
90
 
                         T24 = VZMULJ(T23, VFMACONJ(T1y, T1x));
91
 
                         T22 = VZMULJ(T21, VFMACONJ(T1C, T1B));
92
 
                         T1H = LDW(&(W[TWVL * 8]));
93
 
                         Te = LD(&(Rp[WS(rs, 10)]), ms, &(Rp[0]));
94
 
                         Tf = LD(&(Rm[WS(rs, 10)]), -ms, &(Rm[0]));
95
 
                         Td = LDW(&(W[TWVL * 38]));
96
 
                         Tc = VZMULJ(T9, VFMACONJ(Tb, Ta));
97
 
                         T1E = VSUB(T1z, T1D);
98
 
                         T3t = VADD(T1D, T1z);
99
 
                         T1F = LDW(&(W[TWVL * 40]));
100
 
                         Tj = LD(&(Rp[WS(rs, 14)]), ms, &(Rp[0]));
101
 
                         T1I = VZMULIJ(T1H, VFNMSCONJ(Tb, Ta));
102
 
                         Tk = LD(&(Rm[WS(rs, 14)]), -ms, &(Rm[0]));
103
 
                         Ti = LDW(&(W[TWVL * 54]));
104
 
                         Tg = VZMULJ(Td, VFMACONJ(Tf, Te));
105
 
                         T1M = LDW(&(W[TWVL * 56]));
106
 
                         Tn = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
107
 
                         T1G = VZMULIJ(T1F, VFNMSCONJ(Tf, Te));
108
 
                         To = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
109
 
                         Tm = LDW(&(W[TWVL * 22]));
110
 
                         T1K = LDW(&(W[TWVL * 24]));
111
 
                         Tl = VZMULJ(Ti, VFMACONJ(Tk, Tj));
112
 
                         T3A = VADD(Tc, Tg);
113
 
                         Th = VSUB(Tc, Tg);
114
 
                         T1N = VZMULIJ(T1M, VFNMSCONJ(Tk, Tj));
115
 
                    }
116
 
                    T3v = VSUB(T3t, T3u);
117
 
                    T47 = VADD(T3u, T3t);
118
 
                    T1J = VSUB(T1G, T1I);
119
 
                    T3q = VADD(T1I, T1G);
120
 
                    Tp = VZMULJ(Tm, VFMACONJ(To, Tn));
121
 
                    T1L = VZMULIJ(T1K, VFNMSCONJ(To, Tn));
122
 
                    T8 = VSUB(T3, T7);
123
 
                    T38 = VADD(T3, T7);
124
 
                    T25 = VSUB(T22, T24);
125
 
                    T39 = VADD(T22, T24);
126
 
                    T3z = VADD(Tl, Tp);
127
 
                    Tq = VSUB(Tl, Tp);
128
 
                    T1O = VSUB(T1L, T1N);
129
 
                    T3r = VADD(T1N, T1L);
130
 
                    {
131
 
                         V T10, T11, TZ, T1o, TY, T1r, TN, TO, TM, T19, TR, TS, TQ, T17, T26;
132
 
                         V Tr, T1W, T1P, T3s, T48, TW, TX, TP, T1a, TV, T1q, TT, T18, Ty, Tz;
133
 
                         V Tx, Tw, T1j, Tu, T12, T1p, Tv, Tt, T1h, TD, TA, TE, TC, T1e;
134
 
                         /* Odd-indexed inputs (k = 1, 3, ..., 15), interleaved with work on the even half. */
                         TN = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
135
 
                         TO = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
136
 
                         T41 = VADD(T3A, T3z);
137
 
                         T3B = VSUB(T3z, T3A);
138
 
                         T26 = VSUB(Tq, Th);
139
 
                         Tr = VADD(Th, Tq);
140
 
                         T1W = VADD(T1J, T1O);
141
 
                         T1P = VSUB(T1J, T1O);
142
 
                         T3s = VSUB(T3q, T3r);
143
 
                         T48 = VADD(T3q, T3r);
144
 
                         T40 = VADD(T38, T39);
145
 
                         T3a = VSUB(T38, T39);
146
 
                         T2J = VFNMS(LDK(KP707106781), T26, T25);
147
 
                         T27 = VFMA(LDK(KP707106781), T26, T25);
148
 
                         T2y = VFMA(LDK(KP707106781), Tr, T8);
149
 
                         Ts = VFNMS(LDK(KP707106781), Tr, T8);
150
 
                         T2C = VFMA(LDK(KP707106781), T1W, T1V);
151
 
                         T1X = VFNMS(LDK(KP707106781), T1W, T1V);
152
 
                         T2B = VFMA(LDK(KP707106781), T1P, T1E);
153
 
                         T1Q = VFNMS(LDK(KP707106781), T1P, T1E);
154
 
                         T3F = VFMA(LDK(KP414213562), T3s, T3v);
155
 
                         T3w = VFNMS(LDK(KP414213562), T3v, T3s);
156
 
                         T4l = VSUB(T48, T47);
157
 
                         T49 = VADD(T47, T48);
158
 
                         TM = LDW(&(W[TWVL * 10]));
159
 
                         T19 = LDW(&(W[TWVL * 12]));
160
 
                         TR = LD(&(Rp[WS(rs, 11)]), ms, &(Rp[WS(rs, 1)]));
161
 
                         TS = LD(&(Rm[WS(rs, 11)]), -ms, &(Rm[WS(rs, 1)]));
162
 
                         TQ = LDW(&(W[TWVL * 42]));
163
 
                         T17 = LDW(&(W[TWVL * 44]));
164
 
                         TW = LD(&(Rp[WS(rs, 15)]), ms, &(Rp[WS(rs, 1)]));
165
 
                         TX = LD(&(Rm[WS(rs, 15)]), -ms, &(Rm[WS(rs, 1)]));
166
 
                         TP = VZMULJ(TM, VFMACONJ(TO, TN));
167
 
                         T1a = VZMULIJ(T19, VFNMSCONJ(TO, TN));
168
 
                         TV = LDW(&(W[TWVL * 58]));
169
 
                         T1q = LDW(&(W[TWVL * 60]));
170
 
                         TT = VZMULJ(TQ, VFMACONJ(TS, TR));
171
 
                         T18 = VZMULIJ(T17, VFNMSCONJ(TS, TR));
172
 
                         T10 = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
173
 
                         T11 = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
174
 
                         TZ = LDW(&(W[TWVL * 26]));
175
 
                         T1o = LDW(&(W[TWVL * 28]));
176
 
                         TY = VZMULJ(TV, VFMACONJ(TX, TW));
177
 
                         T1r = VZMULIJ(T1q, VFNMSCONJ(TX, TW));
178
 
                         TU = VSUB(TP, TT);
179
 
                         T3f = VADD(TP, TT);
180
 
                         T1b = VSUB(T18, T1a);
181
 
                         T3m = VADD(T1a, T18);
182
 
                         Tu = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
183
 
                         T12 = VZMULJ(TZ, VFMACONJ(T11, T10));
184
 
                         T1p = VZMULIJ(T1o, VFNMSCONJ(T11, T10));
185
 
                         Tv = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
186
 
                         Tt = LDW(&(W[TWVL * 18]));
187
 
                         T1h = LDW(&(W[TWVL * 20]));
188
 
                         Ty = LD(&(Rp[WS(rs, 13)]), ms, &(Rp[WS(rs, 1)]));
189
 
                         Tz = LD(&(Rm[WS(rs, 13)]), -ms, &(Rm[WS(rs, 1)]));
190
 
                         Tx = LDW(&(W[TWVL * 50]));
191
 
                         T13 = VSUB(TY, T12);
192
 
                         T3e = VADD(TY, T12);
193
 
                         T1s = VSUB(T1p, T1r);
194
 
                         T3n = VADD(T1r, T1p);
195
 
                         Tw = VZMULJ(Tt, VFMACONJ(Tv, Tu));
196
 
                         T1i = VZMULIJ(T1h, VFNMSCONJ(Tv, Tu));
197
 
                         T1j = LDW(&(W[TWVL * 52]));
198
 
                         TD = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
199
 
                         TA = VZMULJ(Tx, VFMACONJ(Tz, Ty));
200
 
                         TE = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
201
 
                         TC = LDW(&(W[TWVL * 2]));
202
 
                         T1e = LDW(&(W[TWVL * 4]));
203
 
                         TH = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
204
 
                         TI = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
205
 
                         T1k = VZMULIJ(T1j, VFNMSCONJ(Tz, Ty));
206
 
                         TG = LDW(&(W[TWVL * 34]));
207
 
                         T3c = VADD(Tw, TA);
208
 
                         TB = VSUB(Tw, TA);
209
 
                         TF = VZMULJ(TC, VFMACONJ(TE, TD));
210
 
                         T1f = VZMULIJ(T1e, VFNMSCONJ(TE, TD));
211
 
                         T1c = LDW(&(W[TWVL * 36]));
212
 
                    }
213
 
                    T3g = VSUB(T3e, T3f);
214
 
                    T44 = VADD(T3e, T3f);
215
 
                    T1l = VSUB(T1i, T1k);
216
 
                    T3k = VADD(T1i, T1k);
217
 
                    TJ = VZMULJ(TG, VFMACONJ(TI, TH));
218
 
                    T3o = VSUB(T3m, T3n);
219
 
                    T4b = VADD(T3n, T3m);
220
 
                    T28 = VFMA(LDK(KP414213562), TU, T13);
221
 
                    T14 = VFNMS(LDK(KP414213562), T13, TU);
222
 
                    T1d = VZMULIJ(T1c, VFNMSCONJ(TI, TH));
223
 
                    T3b = VADD(TF, TJ);
224
 
                    TK = VSUB(TF, TJ);
225
 
               }
226
 
               {
227
 
                    V T4k, T4p, T2z, T2a, T2K, T15, T2E, T1n, T2F, T1u, T4c, T3R, T3D, T3i, T3O;
228
 
                    V T46, T4g, T3G, T3P, T3S, T3x, T4q, T4n, T42, T1g, T3j, T3E, T3p, T4m, T3d;
229
 
                    V T43, T29, TL, T1m, T1t, T3l, T4a, T3C, T3h, T45, T3Q, T3W, T4d, T4h, T3H;
230
 
                    V T3L, T3y, T3K, T4r, T4v, T4o, T4u, T4j, T4i, T4e, T4f, T3N, T3M, T3I, T3J;
231
 
                    V T4x, T4w, T4s, T4t;
232
 
                    T42 = VADD(T40, T41);
233
 
                    T4k = VSUB(T40, T41);
234
 
                    T1g = VSUB(T1d, T1f);
235
 
                    T3j = VADD(T1f, T1d);
236
 
                    T3d = VSUB(T3b, T3c);
237
 
                    T43 = VADD(T3b, T3c);
238
 
                    T29 = VFNMS(LDK(KP414213562), TB, TK);
239
 
                    TL = VFMA(LDK(KP414213562), TK, TB);
240
 
                    T1m = VSUB(T1g, T1l);
241
 
                    T1t = VADD(T1g, T1l);
242
 
                    T3l = VSUB(T3j, T3k);
243
 
                    T4a = VADD(T3j, T3k);
244
 
                    T3C = VSUB(T3g, T3d);
245
 
                    T3h = VADD(T3d, T3g);
246
 
                    T45 = VADD(T43, T44);
247
 
                    T4p = VSUB(T44, T43);
248
 
                    T2z = VADD(T29, T28);
249
 
                    T2a = VSUB(T28, T29);
250
 
                    T2K = VADD(TL, T14);
251
 
                    T15 = VSUB(TL, T14);
252
 
                    T2E = VFMA(LDK(KP707106781), T1m, T1b);
253
 
                    T1n = VFNMS(LDK(KP707106781), T1m, T1b);
254
 
                    T2F = VFMA(LDK(KP707106781), T1t, T1s);
255
 
                    T1u = VFNMS(LDK(KP707106781), T1t, T1s);
256
 
                    T3E = VFNMS(LDK(KP414213562), T3l, T3o);
257
 
                    T3p = VFMA(LDK(KP414213562), T3o, T3l);
258
 
                    T4m = VSUB(T4a, T4b);
259
 
                    T4c = VADD(T4a, T4b);
260
 
                    T3R = VFMA(LDK(KP707106781), T3C, T3B);
261
 
                    T3D = VFNMS(LDK(KP707106781), T3C, T3B);
262
 
                    T3i = VFNMS(LDK(KP707106781), T3h, T3a);
263
 
                    T3O = VFMA(LDK(KP707106781), T3h, T3a);
264
 
                    T46 = VSUB(T42, T45);
265
 
                    T4g = VADD(T42, T45);
266
 
                    T3G = VSUB(T3E, T3F);
267
 
                    T3P = VADD(T3F, T3E);
268
 
                    T3S = VADD(T3w, T3p);
269
 
                    T3x = VSUB(T3p, T3w);
270
 
                    T4q = VSUB(T4m, T4l);
271
 
                    T4n = VADD(T4l, T4m);
272
 
                    T4d = VSUB(T49, T4c);
273
 
                    T4h = VADD(T49, T4c);
274
 
                    T3H = VFNMS(LDK(KP923879532), T3G, T3D);
275
 
                    T3L = VFMA(LDK(KP923879532), T3G, T3D);
276
 
                    T3y = VFMA(LDK(KP923879532), T3x, T3i);
277
 
                    T3K = VFNMS(LDK(KP923879532), T3x, T3i);
278
 
                    T4r = VFMA(LDK(KP707106781), T4q, T4p);
279
 
                    T4v = VFNMS(LDK(KP707106781), T4q, T4p);
280
 
                    T4o = VFMA(LDK(KP707106781), T4n, T4k);
281
 
                    T4u = VFNMS(LDK(KP707106781), T4n, T4k);
282
 
                    T3Q = VFMA(LDK(KP923879532), T3P, T3O);
283
 
                    T3W = VFNMS(LDK(KP923879532), T3P, T3O);
284
 
                    /* Outputs: each result is halved; the ones headed for Rm are conjugated as well. */
                    T4j = VCONJ(VMUL(LDK(KP500000000), VADD(T4h, T4g)));
285
 
                    T4i = VMUL(LDK(KP500000000), VSUB(T4g, T4h));
286
 
                    T4e = VMUL(LDK(KP500000000), VFMAI(T4d, T46));
287
 
                    T4f = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T4d, T46)));
288
 
                    T3N = VMUL(LDK(KP500000000), VFMAI(T3L, T3K));
289
 
                    T3M = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T3L, T3K)));
290
 
                    T3I = VMUL(LDK(KP500000000), VFNMSI(T3H, T3y));
291
 
                    T3J = VCONJ(VMUL(LDK(KP500000000), VFMAI(T3H, T3y)));
292
 
                    T4x = VCONJ(VMUL(LDK(KP500000000), VFMAI(T4v, T4u)));
293
 
                    T4w = VMUL(LDK(KP500000000), VFNMSI(T4v, T4u));
294
 
                    T4s = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T4r, T4o)));
295
 
                    T4t = VMUL(LDK(KP500000000), VFMAI(T4r, T4o));
296
 
                    ST(&(Rp[0]), T4i, ms, &(Rp[0]));
297
 
                    ST(&(Rm[WS(rs, 15)]), T4j, -ms, &(Rm[WS(rs, 1)]));
298
 
                    ST(&(Rm[WS(rs, 7)]), T4f, -ms, &(Rm[WS(rs, 1)]));
299
 
                    ST(&(Rp[WS(rs, 8)]), T4e, ms, &(Rp[0]));
300
 
                    ST(&(Rm[WS(rs, 9)]), T3M, -ms, &(Rm[WS(rs, 1)]));
301
 
                    ST(&(Rp[WS(rs, 10)]), T3N, ms, &(Rp[0]));
302
 
                    ST(&(Rm[WS(rs, 5)]), T3J, -ms, &(Rm[WS(rs, 1)]));
303
 
                    ST(&(Rp[WS(rs, 6)]), T3I, ms, &(Rp[0]));
304
 
                    ST(&(Rp[WS(rs, 12)]), T4w, ms, &(Rp[0]));
305
 
                    ST(&(Rm[WS(rs, 11)]), T4x, -ms, &(Rm[WS(rs, 1)]));
306
 
                    ST(&(Rp[WS(rs, 4)]), T4t, ms, &(Rp[0]));
307
 
                    ST(&(Rm[WS(rs, 3)]), T4s, -ms, &(Rm[WS(rs, 1)]));
308
 
                    {
309
 
                         V T2A, T2W, T2L, T2Z, T2D, T2N, T2M, T2G, T3T, T3X, T16, T2p, T1v, T35, T31;
310
 
                         V T2I, T2S, T34, T2Y, T2P, T2T, T1Y, T2H, T30, T3Z, T3Y, T3U, T3V, T2O, T2X;
311
 
                         V T32, T33, T36, T37, T2U, T2V, T2Q, T2R, T1Z, T2q;
312
 
                         T2A = VFNMS(LDK(KP923879532), T2z, T2y);
313
 
                         T2W = VFMA(LDK(KP923879532), T2z, T2y);
314
 
                         T2L = VFNMS(LDK(KP923879532), T2K, T2J);
315
 
                         T2Z = VFMA(LDK(KP923879532), T2K, T2J);
316
 
                         T2D = VFMA(LDK(KP198912367), T2C, T2B);
317
 
                         T2N = VFNMS(LDK(KP198912367), T2B, T2C);
318
 
                         T2M = VFMA(LDK(KP198912367), T2E, T2F);
319
 
                         T2G = VFNMS(LDK(KP198912367), T2F, T2E);
320
 
                         T3T = VFMA(LDK(KP923879532), T3S, T3R);
321
 
                         T3X = VFNMS(LDK(KP923879532), T3S, T3R);
322
 
                         T16 = VFNMS(LDK(KP923879532), T15, Ts);
323
 
                         T2m = VFMA(LDK(KP923879532), T15, Ts);
324
 
                         T2H = VSUB(T2D, T2G);
325
 
                         T30 = VADD(T2D, T2G);
326
 
                         T2b = VFNMS(LDK(KP923879532), T2a, T27);
327
 
                         T2p = VFMA(LDK(KP923879532), T2a, T27);
328
 
                         T1v = VFMA(LDK(KP668178637), T1u, T1n);
329
 
                         T2c = VFNMS(LDK(KP668178637), T1n, T1u);
330
 
                         T3Z = VCONJ(VMUL(LDK(KP500000000), VFMAI(T3X, T3W)));
331
 
                         T3Y = VMUL(LDK(KP500000000), VFNMSI(T3X, T3W));
332
 
                         T3U = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T3T, T3Q)));
333
 
                         T3V = VMUL(LDK(KP500000000), VFMAI(T3T, T3Q));
334
 
                         T2O = VSUB(T2M, T2N);
335
 
                         T2X = VADD(T2N, T2M);
336
 
                         T35 = VFNMS(LDK(KP980785280), T30, T2Z);
337
 
                         T31 = VFMA(LDK(KP980785280), T30, T2Z);
338
 
                         T2I = VFMA(LDK(KP980785280), T2H, T2A);
339
 
                         T2S = VFNMS(LDK(KP980785280), T2H, T2A);
340
 
                         ST(&(Rp[WS(rs, 14)]), T3Y, ms, &(Rp[0]));
341
 
                         ST(&(Rm[WS(rs, 13)]), T3Z, -ms, &(Rm[WS(rs, 1)]));
342
 
                         ST(&(Rp[WS(rs, 2)]), T3V, ms, &(Rp[0]));
343
 
                         ST(&(Rm[WS(rs, 1)]), T3U, -ms, &(Rm[WS(rs, 1)]));
344
 
                         T34 = VFNMS(LDK(KP980785280), T2X, T2W);
345
 
                         T2Y = VFMA(LDK(KP980785280), T2X, T2W);
346
 
                         T2P = VFMA(LDK(KP980785280), T2O, T2L);
347
 
                         T2T = VFNMS(LDK(KP980785280), T2O, T2L);
348
 
                         T2d = VFMA(LDK(KP668178637), T1Q, T1X);
349
 
                         T1Y = VFNMS(LDK(KP668178637), T1X, T1Q);
350
 
                         T32 = VMUL(LDK(KP500000000), VFNMSI(T31, T2Y));
351
 
                         T33 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T31, T2Y)));
352
 
                         T36 = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T35, T34)));
353
 
                         T37 = VMUL(LDK(KP500000000), VFMAI(T35, T34));
354
 
                         T2U = VMUL(LDK(KP500000000), VFNMSI(T2T, T2S));
355
 
                         T2V = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2T, T2S)));
356
 
                         T2Q = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2P, T2I)));
357
 
                         T2R = VMUL(LDK(KP500000000), VFMAI(T2P, T2I));
358
 
                         T1Z = VSUB(T1v, T1Y);
359
 
                         T2q = VADD(T1Y, T1v);
360
 
                         ST(&(Rm[0]), T33, -ms, &(Rm[0]));
361
 
                         ST(&(Rp[WS(rs, 1)]), T32, ms, &(Rp[WS(rs, 1)]));
362
 
                         ST(&(Rp[WS(rs, 15)]), T37, ms, &(Rp[WS(rs, 1)]));
363
 
                         ST(&(Rm[WS(rs, 14)]), T36, -ms, &(Rm[0]));
364
 
                         ST(&(Rm[WS(rs, 8)]), T2V, -ms, &(Rm[0]));
365
 
                         ST(&(Rp[WS(rs, 9)]), T2U, ms, &(Rp[WS(rs, 1)]));
366
 
                         ST(&(Rp[WS(rs, 7)]), T2R, ms, &(Rp[WS(rs, 1)]));
367
 
                         ST(&(Rm[WS(rs, 6)]), T2Q, -ms, &(Rm[0]));
368
 
                         T2v = VFNMS(LDK(KP831469612), T2q, T2p);
369
 
                         T2r = VFMA(LDK(KP831469612), T2q, T2p);
370
 
                         T20 = VFMA(LDK(KP831469612), T1Z, T16);
371
 
                         T2i = VFNMS(LDK(KP831469612), T1Z, T16);
372
 
                    }
373
 
               }
374
 
          }
375
 
          /* Last eight outputs, built from values that outlive the nested scopes above. */
          T2n = VADD(T2d, T2c);
376
 
          T2e = VSUB(T2c, T2d);
377
 
          T2o = VFMA(LDK(KP831469612), T2n, T2m);
378
 
          T2u = VFNMS(LDK(KP831469612), T2n, T2m);
379
 
          T2j = VFMA(LDK(KP831469612), T2e, T2b);
380
 
          T2f = VFNMS(LDK(KP831469612), T2e, T2b);
381
 
          T2t = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2r, T2o)));
382
 
          T2s = VMUL(LDK(KP500000000), VFMAI(T2r, T2o));
383
 
          T2x = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2v, T2u)));
384
 
          T2w = VMUL(LDK(KP500000000), VFNMSI(T2v, T2u));
385
 
          T2l = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2j, T2i)));
386
 
          T2k = VMUL(LDK(KP500000000), VFMAI(T2j, T2i));
387
 
          T2h = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2f, T20)));
388
 
          T2g = VMUL(LDK(KP500000000), VFNMSI(T2f, T20));
389
 
          ST(&(Rm[WS(rs, 2)]), T2t, -ms, &(Rm[0]));
390
 
          ST(&(Rp[WS(rs, 3)]), T2s, ms, &(Rp[WS(rs, 1)]));
391
 
          ST(&(Rm[WS(rs, 12)]), T2x, -ms, &(Rm[0]));
392
 
          ST(&(Rp[WS(rs, 13)]), T2w, ms, &(Rp[WS(rs, 1)]));
393
 
          ST(&(Rm[WS(rs, 10)]), T2l, -ms, &(Rm[0]));
394
 
          ST(&(Rp[WS(rs, 11)]), T2k, ms, &(Rp[WS(rs, 1)]));
395
 
          ST(&(Rm[WS(rs, 4)]), T2h, -ms, &(Rm[0]));
396
 
          ST(&(Rp[WS(rs, 5)]), T2g, ms, &(Rp[WS(rs, 1)]));
397
 
     }
398
 
}
399
 
 
400
 
/*
 * Twiddle instruction table referenced by the codelet descriptor: one
 * VTW(1, k) entry for each k = 1..31, closed by a {TW_NEXT, VL, 0} entry.
 * NOTE(review): 31 entries lines up with the 31 LDW loads and the
 * TWVL * 62 (= 2 * 31) advance of W in hc2cfdftv_32; the meaning of VTW and
 * TW_NEXT is defined outside this file -- confirm there.
 */
static const tw_instr twinstr[] = {
401
 
     VTW(1, 1),
402
 
     VTW(1, 2),
403
 
     VTW(1, 3),
404
 
     VTW(1, 4),
405
 
     VTW(1, 5),
406
 
     VTW(1, 6),
407
 
     VTW(1, 7),
408
 
     VTW(1, 8),
409
 
     VTW(1, 9),
410
 
     VTW(1, 10),
411
 
     VTW(1, 11),
412
 
     VTW(1, 12),
413
 
     VTW(1, 13),
414
 
     VTW(1, 14),
415
 
     VTW(1, 15),
416
 
     VTW(1, 16),
417
 
     VTW(1, 17),
418
 
     VTW(1, 18),
419
 
     VTW(1, 19),
420
 
     VTW(1, 20),
421
 
     VTW(1, 21),
422
 
     VTW(1, 22),
423
 
     VTW(1, 23),
424
 
     VTW(1, 24),
425
 
     VTW(1, 25),
426
 
     VTW(1, 26),
427
 
     VTW(1, 27),
428
 
     VTW(1, 28),
429
 
     VTW(1, 29),
430
 
     VTW(1, 30),
431
 
     VTW(1, 31),
432
 
     {TW_NEXT, VL, 0}
433
 
};
434
 
 
435
 
/*
 * Descriptor handed to the planner at registration: the value 32, the codelet
 * name string, the twinstr table, a pointer to GENUS, and {119, 94, 130, 0},
 * whose first three entries equal the additions / multiplications / fused
 * multiply-add counts quoted in the generated comment for this variant.
 * NOTE(review): field names and the meaning of the trailing 0 come from the
 * hc2c_desc definition, which is not visible here -- confirm there.
 */
static const hc2c_desc desc = { 32, "hc2cfdftv_32", twinstr, &GENUS, {119, 94, 130, 0} };
436
 
 
437
 
/*
 * Registration entry point: passes the codelet function hc2cfdftv_32, its
 * descriptor desc and the HC2C_VIA_DFT tag to X(khc2c_register) for planner p.
 */
void X(codelet_hc2cfdftv_32) (planner *p) {
438
 
     X(khc2c_register) (p, hc2cfdftv_32, &desc, HC2C_VIA_DFT);
439
 
}
440
 
#else                           /* HAVE_FMA */
441
 
 
442
 
/* Generated by: ../../../genfft/gen_hc2cdft_c -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 32 -dit -name hc2cfdftv_32 -include hc2cfv.h */
443
 
 
444
 
/*
445
 
 * This function contains 249 FP additions, 133 FP multiplications,
446
 
 * (or, 233 additions, 117 multiplications, 16 fused multiply/add),
447
 
 * 130 stack variables, 9 constants, and 64 memory accesses
448
 
 */
449
 
#include "hc2cfv.h"
450
 
 
451
 
/*
 * hc2cfdftv_32 -- variant compiled when HAVE_FMA is not defined.
 *
 * Each loop iteration (m from mb up to me, in steps of VL):
 *   - loads one vector from each of Rp[WS(rs, 0..15)] and Rm[WS(rs, 0..15)]
 *     (32 loads), applying VCONJ to every value read from Rm;
 *   - forms the sum and the difference of each Rp/conj(Rm) pair and passes
 *     them, together with a vector fetched from W by LDW, through VZMULJ
 *     (sums) or VZMULIJ (differences); the row-0 sum is used as is;
 *   - combines the results and stores one vector to each of
 *     Rp[WS(rs, 0..15)] and Rm[WS(rs, 0..15)] (32 stores); every value
 *     stored to Rm is wrapped in VCONJ.
 *
 * Parameters:
 *   Rp, Rm  data arrays, read and overwritten in place; per iteration Rp
 *           moves forward and Rm moves backward by VL * ms.
 *   Ip, Im  stepped in the loop header but never dereferenced in this body.
 *   W       twiddle table; offset by (mb - 1) * ((TWVL / VL) * 62) before
 *           the first iteration and advanced by TWVL * 62 per iteration.
 *   rs      stride handed to WS() to address the 16 rows.
 *   mb, me  half-open iteration range [mb, me).
 *   ms      element step used for the per-iteration pointer updates.
 */
static void hc2cfdftv_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
452
 
{
453
 
     /* Constants; each name encodes the leading digits of its value. */
     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
454
 
     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
455
 
     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
456
 
     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
457
 
     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
458
 
     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
459
 
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
460
 
     DVK(KP353553390, +0.353553390593273762200422181052424519642417969);
461
 
     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
462
 
     INT m;
463
 
     /* Rp/Ip walk forward and Rm/Im walk backward; W advances by 62 twiddle slots per step. */
     for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 62)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(rs)) {
464
 
          /* Values produced by the two input sections below and consumed by the output section. */
          V Ta, T2m, Tx, T2h, T3R, T4h, T3q, T4g, T3B, T4n, T3E, T4o, T1B, T2S, T1O;
465
 
          V T2R, TV, T2p, T1i, T2o, T3L, T4q, T3I, T4r, T3w, T4k, T3t, T4j, T26, T2V;
466
 
          V T2d, T2U;
467
 
          /* Input section 1: even rows 0, 2, 4, ..., 14 of Rp and Rm. */
          {
468
 
               V T4, T1m, T1H, T2j, T1M, T2l, T9, T1o, Tf, T1r, Tq, T1w, Tv, T1y, Tk;
469
 
               V T1t, Tl, Tw, T3P, T3Q, T3o, T3p, T3z, T3A, T3C, T3D, T1p, T1N, T1A, T1C;
470
 
               V T1u, T1z;
471
 
               /* Loads and twiddle multiplications for the even rows. */
               {
472
 
                    V T1, T3, T2, T1l, T1G, T1F, T1E, T1D, T2i, T1L, T1K, T1J, T1I, T2k, T6;
473
 
                    V T8, T7, T5, T1n, Tc, Te, Td, Tb, T1q, Tn, Tp, To, Tm, T1v, Ts;
474
 
                    V Tu, Tt, Tr, T1x, Th, Tj, Ti, Tg, T1s;
475
 
                    T1 = LD(&(Rp[0]), ms, &(Rp[0]));
476
 
                    T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
477
 
                    T3 = VCONJ(T2);
478
 
                    /* Row 0: the sum is not multiplied by a twiddle; only the difference is. */
                    T4 = VADD(T1, T3);
479
 
                    T1l = LDW(&(W[0]));
480
 
                    T1m = VZMULIJ(T1l, VSUB(T3, T1));
481
 
                    T1G = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
482
 
                    T1E = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
483
 
                    T1F = VCONJ(T1E);
484
 
                    T1D = LDW(&(W[TWVL * 16]));
485
 
                    T1H = VZMULIJ(T1D, VSUB(T1F, T1G));
486
 
                    T2i = LDW(&(W[TWVL * 14]));
487
 
                    T2j = VZMULJ(T2i, VADD(T1G, T1F));
488
 
                    T1L = LD(&(Rp[WS(rs, 12)]), ms, &(Rp[0]));
489
 
                    T1J = LD(&(Rm[WS(rs, 12)]), -ms, &(Rm[0]));
490
 
                    T1K = VCONJ(T1J);
491
 
                    T1I = LDW(&(W[TWVL * 48]));
492
 
                    T1M = VZMULIJ(T1I, VSUB(T1K, T1L));
493
 
                    T2k = LDW(&(W[TWVL * 46]));
494
 
                    T2l = VZMULJ(T2k, VADD(T1L, T1K));
495
 
                    T6 = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
496
 
                    T7 = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
497
 
                    T8 = VCONJ(T7);
498
 
                    T5 = LDW(&(W[TWVL * 30]));
499
 
                    T9 = VZMULJ(T5, VADD(T6, T8));
500
 
                    T1n = LDW(&(W[TWVL * 32]));
501
 
                    T1o = VZMULIJ(T1n, VSUB(T8, T6));
502
 
                    Tc = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
503
 
                    Td = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
504
 
                    Te = VCONJ(Td);
505
 
                    Tb = LDW(&(W[TWVL * 6]));
506
 
                    Tf = VZMULJ(Tb, VADD(Tc, Te));
507
 
                    T1q = LDW(&(W[TWVL * 8]));
508
 
                    T1r = VZMULIJ(T1q, VSUB(Te, Tc));
509
 
                    Tn = LD(&(Rp[WS(rs, 14)]), ms, &(Rp[0]));
510
 
                    To = LD(&(Rm[WS(rs, 14)]), -ms, &(Rm[0]));
511
 
                    Tp = VCONJ(To);
512
 
                    Tm = LDW(&(W[TWVL * 54]));
513
 
                    Tq = VZMULJ(Tm, VADD(Tn, Tp));
514
 
                    T1v = LDW(&(W[TWVL * 56]));
515
 
                    T1w = VZMULIJ(T1v, VSUB(Tp, Tn));
516
 
                    Ts = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
517
 
                    Tt = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
518
 
                    Tu = VCONJ(Tt);
519
 
                    Tr = LDW(&(W[TWVL * 22]));
520
 
                    Tv = VZMULJ(Tr, VADD(Ts, Tu));
521
 
                    T1x = LDW(&(W[TWVL * 24]));
522
 
                    T1y = VZMULIJ(T1x, VSUB(Tu, Ts));
523
 
                    Th = LD(&(Rp[WS(rs, 10)]), ms, &(Rp[0]));
524
 
                    Ti = LD(&(Rm[WS(rs, 10)]), -ms, &(Rm[0]));
525
 
                    Tj = VCONJ(Ti);
526
 
                    Tg = LDW(&(W[TWVL * 38]));
527
 
                    Tk = VZMULJ(Tg, VADD(Th, Tj));
528
 
                    T1s = LDW(&(W[TWVL * 40]));
529
 
                    T1t = VZMULIJ(T1s, VSUB(Tj, Th));
530
 
               }
531
 
               /* Sums and differences of the even-row products, kept for the output section. */
               Ta = VMUL(LDK(KP500000000), VSUB(T4, T9));
532
 
               T2m = VSUB(T2j, T2l);
533
 
               Tl = VSUB(Tf, Tk);
534
 
               Tw = VSUB(Tq, Tv);
535
 
               Tx = VMUL(LDK(KP353553390), VADD(Tl, Tw));
536
 
               T2h = VMUL(LDK(KP707106781), VSUB(Tw, Tl));
537
 
               T3P = VADD(Tq, Tv);
538
 
               T3Q = VADD(Tf, Tk);
539
 
               T3R = VSUB(T3P, T3Q);
540
 
               T4h = VADD(T3Q, T3P);
541
 
               T3o = VADD(T4, T9);
542
 
               T3p = VADD(T2j, T2l);
543
 
               T3q = VMUL(LDK(KP500000000), VSUB(T3o, T3p));
544
 
               T4g = VADD(T3o, T3p);
545
 
               T3z = VADD(T1m, T1o);
546
 
               T3A = VADD(T1H, T1M);
547
 
               T3B = VSUB(T3z, T3A);
548
 
               T4n = VADD(T3z, T3A);
549
 
               T3C = VADD(T1w, T1y);
550
 
               T3D = VADD(T1r, T1t);
551
 
               T3E = VSUB(T3C, T3D);
552
 
               T4o = VADD(T3D, T3C);
553
 
               T1p = VSUB(T1m, T1o);
554
 
               T1N = VSUB(T1H, T1M);
555
 
               T1u = VSUB(T1r, T1t);
556
 
               T1z = VSUB(T1w, T1y);
557
 
               T1A = VMUL(LDK(KP707106781), VADD(T1u, T1z));
558
 
               T1C = VMUL(LDK(KP707106781), VSUB(T1z, T1u));
559
 
               T1B = VADD(T1p, T1A);
560
 
               T2S = VADD(T1N, T1C);
561
 
               T1O = VSUB(T1C, T1N);
562
 
               T2R = VSUB(T1p, T1A);
563
 
          }
564
 
          /* Input section 2: odd rows 1, 3, 5, ..., 15 of Rp and Rm. */
          {
565
 
               V TD, T1R, T1b, T29, T1g, T2b, TI, T1T, TO, T1Y, T10, T22, T15, T24, TT;
566
 
               V T1W, TJ, TU, T16, T1h, T3J, T3K, T3G, T3H, T3u, T3v, T3r, T3s, T25, T2c;
567
 
               V T20, T27, T1U, T1Z;
568
 
               /* Loads and twiddle multiplications for the odd rows. */
               {
569
 
                    V TA, TC, TB, Tz, T1Q, T18, T1a, T19, T17, T28, T1d, T1f, T1e, T1c, T2a;
570
 
                    V TF, TH, TG, TE, T1S, TL, TN, TM, TK, T1X, TX, TZ, TY, TW, T21;
571
 
                    V T12, T14, T13, T11, T23, TQ, TS, TR, TP, T1V;
572
 
                    TA = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
573
 
                    TB = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
574
 
                    TC = VCONJ(TB);
575
 
                    Tz = LDW(&(W[TWVL * 2]));
576
 
                    TD = VZMULJ(Tz, VADD(TA, TC));
577
 
                    T1Q = LDW(&(W[TWVL * 4]));
578
 
                    T1R = VZMULIJ(T1Q, VSUB(TC, TA));
579
 
                    T18 = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
580
 
                    T19 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
581
 
                    T1a = VCONJ(T19);
582
 
                    T17 = LDW(&(W[TWVL * 10]));
583
 
                    T1b = VZMULJ(T17, VADD(T18, T1a));
584
 
                    T28 = LDW(&(W[TWVL * 12]));
585
 
                    T29 = VZMULIJ(T28, VSUB(T1a, T18));
586
 
                    T1d = LD(&(Rp[WS(rs, 11)]), ms, &(Rp[WS(rs, 1)]));
587
 
                    T1e = LD(&(Rm[WS(rs, 11)]), -ms, &(Rm[WS(rs, 1)]));
588
 
                    T1f = VCONJ(T1e);
589
 
                    T1c = LDW(&(W[TWVL * 42]));
590
 
                    T1g = VZMULJ(T1c, VADD(T1d, T1f));
591
 
                    T2a = LDW(&(W[TWVL * 44]));
592
 
                    T2b = VZMULIJ(T2a, VSUB(T1f, T1d));
593
 
                    TF = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
594
 
                    TG = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
595
 
                    TH = VCONJ(TG);
596
 
                    TE = LDW(&(W[TWVL * 34]));
597
 
                    TI = VZMULJ(TE, VADD(TF, TH));
598
 
                    T1S = LDW(&(W[TWVL * 36]));
599
 
                    T1T = VZMULIJ(T1S, VSUB(TH, TF));
600
 
                    TL = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
601
 
                    TM = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
602
 
                    TN = VCONJ(TM);
603
 
                    TK = LDW(&(W[TWVL * 18]));
604
 
                    TO = VZMULJ(TK, VADD(TL, TN));
605
 
                    T1X = LDW(&(W[TWVL * 20]));
606
 
                    T1Y = VZMULIJ(T1X, VSUB(TN, TL));
607
 
                    TX = LD(&(Rp[WS(rs, 15)]), ms, &(Rp[WS(rs, 1)]));
608
 
                    TY = LD(&(Rm[WS(rs, 15)]), -ms, &(Rm[WS(rs, 1)]));
609
 
                    TZ = VCONJ(TY);
610
 
                    TW = LDW(&(W[TWVL * 58]));
611
 
                    T10 = VZMULJ(TW, VADD(TX, TZ));
612
 
                    T21 = LDW(&(W[TWVL * 60]));
613
 
                    T22 = VZMULIJ(T21, VSUB(TZ, TX));
614
 
                    T12 = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
615
 
                    T13 = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
616
 
                    T14 = VCONJ(T13);
617
 
                    T11 = LDW(&(W[TWVL * 26]));
618
 
                    T15 = VZMULJ(T11, VADD(T12, T14));
619
 
                    T23 = LDW(&(W[TWVL * 28]));
620
 
                    T24 = VZMULIJ(T23, VSUB(T14, T12));
621
 
                    TQ = LD(&(Rp[WS(rs, 13)]), ms, &(Rp[WS(rs, 1)]));
622
 
                    TR = LD(&(Rm[WS(rs, 13)]), -ms, &(Rm[WS(rs, 1)]));
623
 
                    TS = VCONJ(TR);
624
 
                    TP = LDW(&(W[TWVL * 50]));
625
 
                    TT = VZMULJ(TP, VADD(TQ, TS));
626
 
                    T1V = LDW(&(W[TWVL * 52]));
627
 
                    T1W = VZMULIJ(T1V, VSUB(TS, TQ));
628
 
               }
629
 
               /* Sums and differences of the odd-row products, kept for the output section. */
               TJ = VSUB(TD, TI);
630
 
               TU = VSUB(TO, TT);
631
 
               TV = VFNMS(LDK(KP382683432), TU, VMUL(LDK(KP923879532), TJ));
632
 
               T2p = VFMA(LDK(KP382683432), TJ, VMUL(LDK(KP923879532), TU));
633
 
               T16 = VSUB(T10, T15);
634
 
               T1h = VSUB(T1b, T1g);
635
 
               T1i = VFMA(LDK(KP923879532), T16, VMUL(LDK(KP382683432), T1h));
636
 
               T2o = VFNMS(LDK(KP923879532), T1h, VMUL(LDK(KP382683432), T16));
637
 
               T3J = VADD(T1Y, T1W);
638
 
               T3K = VADD(T1R, T1T);
639
 
               T3L = VSUB(T3J, T3K);
640
 
               T4q = VADD(T3K, T3J);
641
 
               T3G = VADD(T22, T24);
642
 
               T3H = VADD(T29, T2b);
643
 
               T3I = VSUB(T3G, T3H);
644
 
               T4r = VADD(T3G, T3H);
645
 
               T3u = VADD(T10, T15);
646
 
               T3v = VADD(T1b, T1g);
647
 
               T3w = VSUB(T3u, T3v);
648
 
               T4k = VADD(T3u, T3v);
649
 
               T3r = VADD(TD, TI);
650
 
               T3s = VADD(TO, TT);
651
 
               T3t = VSUB(T3r, T3s);
652
 
               T4j = VADD(T3r, T3s);
653
 
               T25 = VSUB(T22, T24);
654
 
               T2c = VSUB(T29, T2b);
655
 
               T1U = VSUB(T1R, T1T);
656
 
               T1Z = VSUB(T1W, T1Y);
657
 
               T20 = VMUL(LDK(KP707106781), VADD(T1U, T1Z));
658
 
               T27 = VMUL(LDK(KP707106781), VSUB(T1Z, T1U));
659
 
               T26 = VADD(T20, T25);
660
 
               T2V = VADD(T27, T2c);
661
 
               T2d = VSUB(T27, T2c);
662
 
               T2U = VSUB(T25, T20);
663
 
          }
664
 
          /* Output section: combines the values above and stores all 16 Rp and 16 Rm rows. */
          {
665
 
               V T4m, T4w, T4t, T4x, T4i, T4l, T4p, T4s, T4u, T4z, T4v, T4y, T4E, T4L, T4H;
666
 
               V T4K, T4A, T4F, T4D, T4G, T4B, T4C, T4I, T4N, T4J, T4M, T3O, T4c, T4d, T3X;
667
 
               V T40, T46, T49, T41, T3y, T47, T3T, T45, T3N, T44, T3W, T48, T3x, T3S, T3F;
668
 
               V T3M, T3U, T3V, T3Y, T4e, T4f, T3Z, T42, T4a, T4b, T43;
669
 
               /* Stores Rp rows 0 and 8, Rm rows 7 and 15. */
               T4i = VADD(T4g, T4h);
670
 
               T4l = VADD(T4j, T4k);
671
 
               T4m = VADD(T4i, T4l);
672
 
               T4w = VSUB(T4i, T4l);
673
 
               T4p = VADD(T4n, T4o);
674
 
               T4s = VADD(T4q, T4r);
675
 
               T4t = VADD(T4p, T4s);
676
 
               T4x = VBYI(VSUB(T4s, T4p));
677
 
               T4u = VCONJ(VMUL(LDK(KP500000000), VSUB(T4m, T4t)));
678
 
               ST(&(Rm[WS(rs, 15)]), T4u, -ms, &(Rm[WS(rs, 1)]));
679
 
               T4z = VMUL(LDK(KP500000000), VADD(T4w, T4x));
680
 
               ST(&(Rp[WS(rs, 8)]), T4z, ms, &(Rp[0]));
681
 
               T4v = VMUL(LDK(KP500000000), VADD(T4m, T4t));
682
 
               ST(&(Rp[0]), T4v, ms, &(Rp[0]));
683
 
               T4y = VCONJ(VMUL(LDK(KP500000000), VSUB(T4w, T4x)));
684
 
               ST(&(Rm[WS(rs, 7)]), T4y, -ms, &(Rm[WS(rs, 1)]));
685
 
               /* Stores Rp rows 4 and 12, Rm rows 3 and 11. */
               T4A = VMUL(LDK(KP500000000), VSUB(T4g, T4h));
686
 
               T4F = VSUB(T4k, T4j);
687
 
               T4B = VSUB(T4n, T4o);
688
 
               T4C = VSUB(T4r, T4q);
689
 
               T4D = VMUL(LDK(KP353553390), VADD(T4B, T4C));
690
 
               T4G = VMUL(LDK(KP707106781), VSUB(T4C, T4B));
691
 
               T4E = VADD(T4A, T4D);
692
 
               T4L = VMUL(LDK(KP500000000), VBYI(VSUB(T4G, T4F)));
693
 
               T4H = VMUL(LDK(KP500000000), VBYI(VADD(T4F, T4G)));
694
 
               T4K = VSUB(T4A, T4D);
695
 
               T4I = VCONJ(VSUB(T4E, T4H));
696
 
               ST(&(Rm[WS(rs, 3)]), T4I, -ms, &(Rm[WS(rs, 1)]));
697
 
               T4N = VADD(T4K, T4L);
698
 
               ST(&(Rp[WS(rs, 12)]), T4N, ms, &(Rp[0]));
699
 
               T4J = VADD(T4E, T4H);
700
 
               ST(&(Rp[WS(rs, 4)]), T4J, ms, &(Rp[0]));
701
 
               T4M = VCONJ(VSUB(T4K, T4L));
702
 
               ST(&(Rm[WS(rs, 11)]), T4M, -ms, &(Rm[WS(rs, 1)]));
703
 
               /* Stores Rp rows 2, 6, 10, 14 and Rm rows 1, 5, 9, 13. */
               T3x = VMUL(LDK(KP353553390), VADD(T3t, T3w));
704
 
               T3y = VADD(T3q, T3x);
705
 
               T47 = VSUB(T3q, T3x);
706
 
               T3S = VMUL(LDK(KP707106781), VSUB(T3w, T3t));
707
 
               T3T = VADD(T3R, T3S);
708
 
               T45 = VSUB(T3S, T3R);
709
 
               T3F = VFMA(LDK(KP923879532), T3B, VMUL(LDK(KP382683432), T3E));
710
 
               T3M = VFNMS(LDK(KP382683432), T3L, VMUL(LDK(KP923879532), T3I));
711
 
               T3N = VMUL(LDK(KP500000000), VADD(T3F, T3M));
712
 
               T44 = VSUB(T3M, T3F);
713
 
               T3U = VFNMS(LDK(KP382683432), T3B, VMUL(LDK(KP923879532), T3E));
714
 
               T3V = VFMA(LDK(KP923879532), T3L, VMUL(LDK(KP382683432), T3I));
715
 
               T3W = VADD(T3U, T3V);
716
 
               T48 = VMUL(LDK(KP500000000), VSUB(T3V, T3U));
717
 
               T3O = VADD(T3y, T3N);
718
 
               T4c = VMUL(LDK(KP500000000), VBYI(VADD(T45, T44)));
719
 
               T4d = VADD(T47, T48);
720
 
               T3X = VMUL(LDK(KP500000000), VBYI(VADD(T3T, T3W)));
721
 
               T40 = VSUB(T3y, T3N);
722
 
               T46 = VMUL(LDK(KP500000000), VBYI(VSUB(T44, T45)));
723
 
               T49 = VSUB(T47, T48);
724
 
               T41 = VMUL(LDK(KP500000000), VBYI(VSUB(T3W, T3T)));
725
 
               T3Y = VCONJ(VSUB(T3O, T3X));
726
 
               ST(&(Rm[WS(rs, 1)]), T3Y, -ms, &(Rm[WS(rs, 1)]));
727
 
               T4e = VADD(T4c, T4d);
728
 
               ST(&(Rp[WS(rs, 6)]), T4e, ms, &(Rp[0]));
729
 
               T4f = VCONJ(VSUB(T4d, T4c));
730
 
               ST(&(Rm[WS(rs, 5)]), T4f, -ms, &(Rm[WS(rs, 1)]));
731
 
               T3Z = VADD(T3O, T3X);
732
 
               ST(&(Rp[WS(rs, 2)]), T3Z, ms, &(Rp[0]));
733
 
               T42 = VCONJ(VSUB(T40, T41));
734
 
               ST(&(Rm[WS(rs, 13)]), T42, -ms, &(Rm[WS(rs, 1)]));
735
 
               T4a = VADD(T46, T49);
736
 
               ST(&(Rp[WS(rs, 10)]), T4a, ms, &(Rp[0]));
737
 
               T4b = VCONJ(VSUB(T49, T46));
738
 
               ST(&(Rm[WS(rs, 9)]), T4b, -ms, &(Rm[WS(rs, 1)]));
739
 
               T43 = VADD(T40, T41);
740
 
               ST(&(Rp[WS(rs, 14)]), T43, ms, &(Rp[0]));
741
 
               /* Stores Rp rows 1, 7, 9, 15 and Rm rows 0, 6, 8, 14. */
               {
742
 
                    V T2g, T2K, T2L, T2v, T2y, T2E, T2H, T2z, T1k, T2F, T2u, T2G, T2f, T2C, T2r;
743
 
                    V T2D, Ty, T1j, T2s, T2t, T1P, T2e, T2n, T2q, T2w, T2M, T2N, T2x, T2A, T2I;
744
 
                    V T2J, T2B;
745
 
                    Ty = VADD(Ta, Tx);
746
 
                    T1j = VMUL(LDK(KP500000000), VADD(TV, T1i));
747
 
                    T1k = VADD(Ty, T1j);
748
 
                    T2F = VSUB(Ty, T1j);
749
 
                    T2s = VFNMS(LDK(KP195090322), T1B, VMUL(LDK(KP980785280), T1O));
750
 
                    T2t = VFMA(LDK(KP195090322), T26, VMUL(LDK(KP980785280), T2d));
751
 
                    T2u = VADD(T2s, T2t);
752
 
                    T2G = VMUL(LDK(KP500000000), VSUB(T2t, T2s));
753
 
                    T1P = VFMA(LDK(KP980785280), T1B, VMUL(LDK(KP195090322), T1O));
754
 
                    T2e = VFNMS(LDK(KP195090322), T2d, VMUL(LDK(KP980785280), T26));
755
 
                    T2f = VMUL(LDK(KP500000000), VADD(T1P, T2e));
756
 
                    T2C = VSUB(T2e, T1P);
757
 
                    T2n = VSUB(T2h, T2m);
758
 
                    T2q = VSUB(T2o, T2p);
759
 
                    T2r = VADD(T2n, T2q);
760
 
                    T2D = VSUB(T2q, T2n);
761
 
                    T2g = VADD(T1k, T2f);
762
 
                    T2K = VMUL(LDK(KP500000000), VBYI(VADD(T2D, T2C)));
763
 
                    T2L = VADD(T2F, T2G);
764
 
                    T2v = VMUL(LDK(KP500000000), VBYI(VADD(T2r, T2u)));
765
 
                    T2y = VSUB(T1k, T2f);
766
 
                    T2E = VMUL(LDK(KP500000000), VBYI(VSUB(T2C, T2D)));
767
 
                    T2H = VSUB(T2F, T2G);
768
 
                    T2z = VMUL(LDK(KP500000000), VBYI(VSUB(T2u, T2r)));
769
 
                    T2w = VCONJ(VSUB(T2g, T2v));
770
 
                    ST(&(Rm[0]), T2w, -ms, &(Rm[0]));
771
 
                    T2M = VADD(T2K, T2L);
772
 
                    ST(&(Rp[WS(rs, 7)]), T2M, ms, &(Rp[WS(rs, 1)]));
773
 
                    T2N = VCONJ(VSUB(T2L, T2K));
774
 
                    ST(&(Rm[WS(rs, 6)]), T2N, -ms, &(Rm[0]));
775
 
                    T2x = VADD(T2g, T2v);
776
 
                    ST(&(Rp[WS(rs, 1)]), T2x, ms, &(Rp[WS(rs, 1)]));
777
 
                    T2A = VCONJ(VSUB(T2y, T2z));
778
 
                    ST(&(Rm[WS(rs, 14)]), T2A, -ms, &(Rm[0]));
779
 
                    T2I = VADD(T2E, T2H);
780
 
                    ST(&(Rp[WS(rs, 9)]), T2I, ms, &(Rp[WS(rs, 1)]));
781
 
                    T2J = VCONJ(VSUB(T2H, T2E));
782
 
                    ST(&(Rm[WS(rs, 8)]), T2J, -ms, &(Rm[0]));
783
 
                    T2B = VADD(T2y, T2z);
784
 
                    ST(&(Rp[WS(rs, 15)]), T2B, ms, &(Rp[WS(rs, 1)]));
785
 
               }
786
 
               /* Stores Rp rows 3, 5, 11, 13 and Rm rows 2, 4, 10, 12. */
               {
787
 
                    V T2Y, T3k, T3l, T35, T38, T3e, T3h, T39, T2Q, T3f, T34, T3g, T2X, T3c, T31;
788
 
                    V T3d, T2O, T2P, T32, T33, T2T, T2W, T2Z, T30, T36, T3m, T3n, T37, T3a, T3i;
789
 
                    V T3j, T3b;
790
 
                    T2O = VSUB(Ta, Tx);
791
 
                    T2P = VMUL(LDK(KP500000000), VADD(T2p, T2o));
792
 
                    T2Q = VADD(T2O, T2P);
793
 
                    T3f = VSUB(T2O, T2P);
794
 
                    T32 = VFNMS(LDK(KP555570233), T2R, VMUL(LDK(KP831469612), T2S));
795
 
                    T33 = VFMA(LDK(KP555570233), T2U, VMUL(LDK(KP831469612), T2V));
796
 
                    T34 = VADD(T32, T33);
797
 
                    T3g = VMUL(LDK(KP500000000), VSUB(T33, T32));
798
 
                    T2T = VFMA(LDK(KP831469612), T2R, VMUL(LDK(KP555570233), T2S));
799
 
                    T2W = VFNMS(LDK(KP555570233), T2V, VMUL(LDK(KP831469612), T2U));
800
 
                    T2X = VMUL(LDK(KP500000000), VADD(T2T, T2W));
801
 
                    T3c = VSUB(T2W, T2T);
802
 
                    T2Z = VADD(T2m, T2h);
803
 
                    T30 = VSUB(T1i, TV);
804
 
                    T31 = VADD(T2Z, T30);
805
 
                    T3d = VSUB(T30, T2Z);
806
 
                    T2Y = VADD(T2Q, T2X);
807
 
                    T3k = VMUL(LDK(KP500000000), VBYI(VADD(T3d, T3c)));
808
 
                    T3l = VADD(T3f, T3g);
809
 
                    T35 = VMUL(LDK(KP500000000), VBYI(VADD(T31, T34)));
810
 
                    T38 = VSUB(T2Q, T2X);
811
 
                    T3e = VMUL(LDK(KP500000000), VBYI(VSUB(T3c, T3d)));
812
 
                    T3h = VSUB(T3f, T3g);
813
 
                    T39 = VMUL(LDK(KP500000000), VBYI(VSUB(T34, T31)));
814
 
                    T36 = VCONJ(VSUB(T2Y, T35));
815
 
                    ST(&(Rm[WS(rs, 2)]), T36, -ms, &(Rm[0]));
816
 
                    T3m = VADD(T3k, T3l);
817
 
                    ST(&(Rp[WS(rs, 5)]), T3m, ms, &(Rp[WS(rs, 1)]));
818
 
                    T3n = VCONJ(VSUB(T3l, T3k));
819
 
                    ST(&(Rm[WS(rs, 4)]), T3n, -ms, &(Rm[0]));
820
 
                    T37 = VADD(T2Y, T35);
821
 
                    ST(&(Rp[WS(rs, 3)]), T37, ms, &(Rp[WS(rs, 1)]));
822
 
                    T3a = VCONJ(VSUB(T38, T39));
823
 
                    ST(&(Rm[WS(rs, 12)]), T3a, -ms, &(Rm[0]));
824
 
                    T3i = VADD(T3e, T3h);
825
 
                    ST(&(Rp[WS(rs, 11)]), T3i, ms, &(Rp[WS(rs, 1)]));
826
 
                    T3j = VCONJ(VSUB(T3h, T3e));
827
 
                    ST(&(Rm[WS(rs, 10)]), T3j, -ms, &(Rm[0]));
828
 
                    T3b = VADD(T38, T39);
829
 
                    ST(&(Rp[WS(rs, 13)]), T3b, ms, &(Rp[WS(rs, 1)]));
830
 
               }
831
 
          }
832
 
     }
833
 
}
834
 
 
835
 
static const tw_instr twinstr[] = {
836
 
     VTW(1, 1),
837
 
     VTW(1, 2),
838
 
     VTW(1, 3),
839
 
     VTW(1, 4),
840
 
     VTW(1, 5),
841
 
     VTW(1, 6),
842
 
     VTW(1, 7),
843
 
     VTW(1, 8),
844
 
     VTW(1, 9),
845
 
     VTW(1, 10),
846
 
     VTW(1, 11),
847
 
     VTW(1, 12),
848
 
     VTW(1, 13),
849
 
     VTW(1, 14),
850
 
     VTW(1, 15),
851
 
     VTW(1, 16),
852
 
     VTW(1, 17),
853
 
     VTW(1, 18),
854
 
     VTW(1, 19),
855
 
     VTW(1, 20),
856
 
     VTW(1, 21),
857
 
     VTW(1, 22),
858
 
     VTW(1, 23),
859
 
     VTW(1, 24),
860
 
     VTW(1, 25),
861
 
     VTW(1, 26),
862
 
     VTW(1, 27),
863
 
     VTW(1, 28),
864
 
     VTW(1, 29),
865
 
     VTW(1, 30),
866
 
     VTW(1, 31),
867
 
     {TW_NEXT, VL, 0}
868
 
};
869
 
 
870
 
/* Descriptor handed to the planner for the non-FMA build of this codelet. */
static const hc2c_desc desc = {
     32,                    /* matches the "-n 32" generator option */
     "hc2cfdftv_32",        /* codelet name */
     twinstr,               /* twiddle-instruction table defined above */
     &GENUS,
     {233, 117, 16, 0}      /* additions, multiplications, fused multiply/adds
                               as listed in the generated header comment;
                               NOTE(review): meaning of the 4th field not
                               visible here */
};
871
 
 
872
 
/*
 * Registration entry point (non-FMA build): passes the hc2cfdftv_32 kernel
 * and its descriptor to khc2c_register with the HC2C_VIA_DFT kind.
 */
void X(codelet_hc2cfdftv_32)(planner *p)
{
     X(khc2c_register)(p, hc2cfdftv_32, &desc, HC2C_VIA_DFT);
}
875
 
#endif                          /* HAVE_FMA */